962 files changed, 70437 insertions, 27210 deletions
diff --git a/llvm/test/Analysis/CostModel/AArch64/cast.ll b/llvm/test/Analysis/CostModel/AArch64/cast.ll
index c363b58..b82c2f1 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cast.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cast.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 %s | FileCheck --check-prefixes=CHECK,CHECK-NOFP16 %s
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16 %s | FileCheck --check-prefixes=CHECK,CHECK-FP16 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16,+bf16 %s | FileCheck --check-prefixes=CHECK,CHECK-FP16,CHECK-BF16 %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
@@ -1237,28 +1238,51 @@ define void @fp16cast() {
 }
 
 define void @bf16cast() {
-; CHECK-LABEL: 'bf16cast'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extf16f32 = fpext bfloat undef to float
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv2f16f32 = fpext <2 x bfloat> undef to <2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv4f16f32 = fpext <4 x bfloat> undef to <4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %extv8f16f32 = fpext <8 x bfloat> undef to <8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %extv16f16f32 = fpext <16 x bfloat> undef to <16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extf16f64 = fpext bfloat undef to double
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv2f16f64 = fpext <2 x bfloat> undef to <2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %extv4f16f64 = fpext <4 x bfloat> undef to <4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %extv8f16f64 = fpext <8 x bfloat> undef to <8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %extv16f16f64 = fpext <16 x bfloat> undef to <16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %truncf16f32 = fptrunc float undef to bfloat
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %truncv2f16f32 = fptrunc <2 x float> undef to <2 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %truncv4f16f32 = fptrunc <4 x float> undef to <4 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %truncv8f16f32 = fptrunc <8 x float> undef to <8 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %truncv16f16f32 = fptrunc <16 x float> undef to <16 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %truncf16f64 = fptrunc double undef to bfloat
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %truncv2f16f64 = fptrunc <2 x double> undef to <2 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %truncv4f16f64 = fptrunc <4 x double> undef to <4 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %truncv8f16f64 = fptrunc <8 x double> undef to <8 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %truncv16f16f64 = fptrunc <16 x double> undef to <16 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NOFP16-LABEL: 'bf16cast'
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extf16f32 = fpext bfloat undef to float
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv2f16f32 = fpext <2 x bfloat> undef to <2 x float>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv4f16f32 = fpext <4 x bfloat> undef to <4 x float>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %extv8f16f32 = fpext <8 x bfloat> undef to <8 x float>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %extv16f16f32 = fpext <16 x bfloat> undef to <16 x float>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %extf16f64 = fpext bfloat undef to double
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %extv2f16f64 = fpext <2 x bfloat> undef to <2 x double>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %extv4f16f64 = fpext <4 x bfloat> undef to <4 x double>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %extv8f16f64 = fpext <8 x bfloat> undef to <8 x double>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %extv16f16f64 = fpext <16 x bfloat> undef to <16 x double>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %truncf16f32 = fptrunc float undef to bfloat
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %truncv2f16f32 = fptrunc <2 x float> undef to <2 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %truncv4f16f32 = fptrunc <4 x float> undef to <4 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %truncv8f16f32 = fptrunc <8 x float> undef to <8 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %truncv16f16f32 = fptrunc <16 x float> undef to <16 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %truncf16f64 = fptrunc double undef to bfloat
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %truncv2f16f64 = fptrunc <2 x double> undef to <2 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %truncv4f16f64 = fptrunc <4 x double> undef to <4 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %truncv8f16f64 = fptrunc <8 x double> undef to <8 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %truncv16f16f64 = fptrunc <16 x double> undef to <16 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-BF16-LABEL: 'bf16cast'
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extf16f32 = fpext bfloat undef to float
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv2f16f32 = fpext <2 x bfloat> undef to <2 x float>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv4f16f32 = fpext <4 x bfloat> undef to <4 x float>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %extv8f16f32 = fpext <8 x bfloat> undef to <8 x float>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %extv16f16f32 = fpext <16 x bfloat> undef to <16 x float>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %extf16f64 = fpext bfloat undef to double
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %extv2f16f64 = fpext <2 x bfloat> undef to <2 x double>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %extv4f16f64 = fpext <4 x bfloat> undef to <4 x double>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %extv8f16f64 = fpext <8 x bfloat> undef to <8 x double>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %extv16f16f64 = fpext <16 x bfloat> undef to <16 x double>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %truncf16f32 = fptrunc float undef to bfloat
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %truncv2f16f32 = fptrunc <2 x float> undef to <2 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %truncv4f16f32 = fptrunc <4 x float> undef to <4 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %truncv8f16f32 = fptrunc <8 x float> undef to <8 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %truncv16f16f32 = fptrunc <16 x float> undef to <16 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %truncf16f64 = fptrunc double undef to bfloat
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %truncv2f16f64 = fptrunc <2 x double> undef to <2 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %truncv4f16f64 = fptrunc <4 x double> undef to <4 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %truncv8f16f64 = fptrunc <8 x double> undef to <8 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %truncv16f16f64 = fptrunc <16 x double> undef to <16 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %extf16f32 = fpext bfloat undef to float
   %extv2f16f32 = fpext <2 x bfloat> undef to <2 x float>
diff --git a/llvm/test/Analysis/CostModel/AArch64/div.ll b/llvm/test/Analysis/CostModel/AArch64/div.ll
index ada0be6..ef52d0d 100644
--- a/llvm/test/Analysis/CostModel/AArch64/div.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/div.ll
@@ -11,14 +11,20 @@ define i32 @sdiv() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = sdiv <2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = sdiv <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = sdiv <4 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = sdiv <2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = sdiv <4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = sdiv <8 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, undef
@@ -32,16 +38,22 @@ define i32 @sdiv() {
   %V8i64 = sdiv <8 x i64> undef, undef
 
   %I32 = sdiv i32 undef, undef
+  %V2i32 = sdiv <2 x i32> undef, undef
   %V4i32 = sdiv <4 x i32> undef, undef
   %V8i32 = sdiv <8 x i32> undef, undef
   %V16i32 = sdiv <16 x i32> undef, undef
 
   %I16 = sdiv i16 undef, undef
+  %V2i16 = sdiv <2 x i16> undef, undef
+  %V4i16 = sdiv <4 x i16> undef, undef
   %V8i16 = sdiv <8 x i16> undef, undef
   %V16i16 = sdiv <16 x i16> undef, undef
   %V32i16 = sdiv <32 x i16> undef, undef
 
   %I8 = sdiv i8 undef, undef
+  %V2i8 = sdiv <2 x i8> undef, undef
+  %V4i8 = sdiv <4 x i8> undef, undef
+  %V8i8 = sdiv <8 x i8> undef, undef
   %V16i8 = sdiv <16 x i8> undef, undef
   %V32i8 = sdiv <32 x i8> undef, undef
   %V64i8 = sdiv <64 x i8> undef, undef
@@ -57,14 +69,20 @@ define i32 @udiv() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = udiv <2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = udiv <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = udiv <4 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = udiv <2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = udiv <4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = udiv <8 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, undef
@@ -78,16 +96,22 @@ define i32 @udiv() {
   %V8i64 = udiv <8 x i64> undef, undef
 
   %I32 = udiv i32 undef, undef
+  %V2i32 = udiv <2 x i32> undef, undef
   %V4i32 = udiv <4 x i32> undef, undef
   %V8i32 = udiv <8 x i32> undef, undef
   %V16i32 = udiv <16 x i32> undef, undef
 
   %I16 = udiv i16 undef, undef
+  %V2i16 = udiv <2 x i16> undef, undef
+  %V4i16 = udiv <4 x i16> undef, undef
   %V8i16 = udiv <8 x i16> undef, undef
   %V16i16 = udiv <16 x i16> undef, undef
   %V32i16 = udiv <32 x i16> undef, undef
 
   %I8 = udiv i8 undef, undef
+  %V2i8 = udiv <2 x i8> undef, undef
+  %V4i8 = udiv <4 x i8> undef, undef
+  %V8i8 = udiv <8 x i8> undef, undef
   %V16i8 = udiv <16 x i8> undef, undef
   %V32i8 = udiv <32 x i8> undef, undef
   %V64i8 = udiv <64 x i8> undef, undef
@@ -103,14 +127,20 @@ define i32 @sdiv_const() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = sdiv <2 x i32> undef, <i32 4, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = sdiv <2 x i16> undef, <i16 4, i16 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = sdiv <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = sdiv <2 x i8> undef, <i8 4, i8 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = sdiv <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = sdiv <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
@@ -124,16 +154,22 @@ define i32 @sdiv_const() {
   %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 
   %I32 = sdiv i32 undef, 7
+  %V2i32 = sdiv <2 x i32> undef, <i32 4, i32 5>
   %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
   %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 
   %I16 = sdiv i16 undef, 7
+  %V2i16 = sdiv <2 x i16> undef, <i16 4, i16 5>
+  %V4i16 = sdiv <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
   %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
   %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
   %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 
   %I8 = sdiv i8 undef, 7
+  %V2i8 = sdiv <2 x i8> undef, <i8 4, i8 5>
+  %V4i8 = sdiv <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+  %V8i8 = sdiv <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
   %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
@@ -149,14 +185,20 @@ define i32 @udiv_const() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = udiv <2 x i32> undef, <i32 4, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = udiv <2 x i16> undef, <i16 4, i16 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = udiv <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = udiv <2 x i8> undef, <i8 4, i8 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = udiv <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = udiv <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
@@ -171,16 +213,22 @@ define i32 @udiv_const() {
   %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 
   %I32 = udiv i32 undef, 7
+  %V2i32 = udiv <2 x i32> undef, <i32 4, i32 5>
   %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
   %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 
   %I16 = udiv i16 undef, 7
+  %V2i16 = udiv <2 x i16> undef, <i16 4, i16 5>
+  %V4i16 = udiv <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
   %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
   %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
   %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 
   %I8 = udiv i8 undef, 7
+  %V2i8 = udiv <2 x i8> undef, <i8 4, i8 5>
+  %V4i8 = udiv <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+  %V8i8 = udiv <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
   %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
@@ -196,14 +244,20 @@ define i32 @sdiv_uniformconst() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = sdiv <4 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i64 = sdiv <8 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i32 = sdiv <2 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = sdiv <4 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = sdiv <8 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = sdiv <16 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i16 = sdiv <2 x i16> undef, splat (i16 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i16 = sdiv <4 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i16 = sdiv <8 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i16 = sdiv <16 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i16 = sdiv <32 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i8 = sdiv <2 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i8 = sdiv <4 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i8 = sdiv <8 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i8 = sdiv <16 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = sdiv <32 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = sdiv <64 x i8> undef, splat (i8 7)
@@ -217,16 +271,22 @@ define i32 @sdiv_uniformconst() {
   %V8i64 = sdiv <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
 
   %I32 = sdiv i32 undef, 7
+  %V2i32 = sdiv <2 x i32> undef, <i32 7, i32 7>
   %V4i32 = sdiv <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
   %V8i32 = sdiv <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   %V16i32 = sdiv <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 
   %I16 = sdiv i16 undef, 7
+  %V2i16 = sdiv <2 x i16> undef, <i16 7, i16 7>
+  %V4i16 = sdiv <4 x i16> undef, <i16 7, i16 7, i16 7, i16 7>
   %V8i16 = sdiv <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 
   %I8 = sdiv i8 undef, 7
+  %V2i8 = sdiv <2 x i8> undef, <i8 7, i8 7>
+  %V4i8 = sdiv <4 x i8> undef, <i8 7, i8 7, i8 7, i8 7>
+  %V8i8 = sdiv <8 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -242,14 +302,20 @@ define i32 @udiv_uniformconst() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = udiv <4 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i64 = udiv <8 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i32 = udiv <2 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = udiv <4 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = udiv <8 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = udiv <16 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i16 = udiv <2 x i16> undef, splat (i16 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i16 = udiv <4 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i16 = udiv <8 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i16 = udiv <16 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i16 = udiv <32 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i8 = udiv <2 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i8 = udiv <4 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i8 = udiv <8 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i8 = udiv <16 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = udiv <32 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = udiv <64 x i8> undef, splat (i8 7)
@@ -263,16 +329,22 @@ define i32 @udiv_uniformconst() {
   %V8i64 = udiv <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
 
   %I32 = udiv i32 undef, 7
+  %V2i32 = udiv <2 x i32> undef, <i32 7, i32 7>
   %V4i32 = udiv <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
   %V8i32 = udiv <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   %V16i32 = udiv <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 
   %I16 = udiv i16 undef, 7
+  %V2i16 = udiv <2 x i16> undef, <i16 7, i16 7>
+  %V4i16 = udiv <4 x i16> undef, <i16 7, i16 7, i16 7, i16 7>
   %V8i16 = udiv <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V32i16 = udiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 
   %I8 = udiv i8 undef, 7
+  %V2i8 = udiv <2 x i8> undef, <i8 7, i8 7>
+  %V4i8 = udiv <4 x i8> undef, <i8 7, i8 7, i8 7, i8 7>
+  %V8i8 = udiv <8 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -288,14 +360,20 @@ define i32 @sdiv_constpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = sdiv i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = sdiv <2 x i32> undef, <i32 2, i32 4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = sdiv i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = sdiv <2 x i16> undef, <i16 2, i16 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = sdiv <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = sdiv i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = sdiv <2 x i8> undef, <i8 2, i8 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = sdiv <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = sdiv <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
@@ -309,16 +387,22 @@ define i32 @sdiv_constpow2() {
   %V8i64 = sdiv <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 
   %I32 = sdiv i32 undef, 16
+  %V2i32 = sdiv <2 x i32> undef, <i32 2, i32 4>
   %V4i32 = sdiv <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
   %V8i32 = sdiv <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
   %V16i32 = sdiv <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 
   %I16 = sdiv i16 undef, 16
+  %V2i16 = sdiv <2 x i16> undef, <i16 2, i16 4>
+  %V4i16 = sdiv <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
   %V8i16 = sdiv <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V16i16 = sdiv <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V32i16 = sdiv <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 
   %I8 = sdiv i8 undef, 16
+  %V2i8 = sdiv <2 x i8> undef, <i8 2, i8 4>
+  %V4i8 = sdiv <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+  %V8i8 = sdiv <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V16i8 = sdiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V32i8 = sdiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V64i8 = sdiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
@@ -334,14 +418,20 @@ define i32 @udiv_constpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = udiv <2 x i32> undef, <i32 2, i32 4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = udiv <2 x i16> undef, <i16 2, i16 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = udiv <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = udiv <2 x i8> undef, <i8 2, i8 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = udiv <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = udiv <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
@@ -355,16 +445,22 @@ define i32 @udiv_constpow2() {
   %V8i64 = udiv <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 
   %I32 = udiv i32 undef, 16
+  %V2i32 = udiv <2 x i32> undef, <i32 2, i32 4>
   %V4i32 = udiv <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
   %V8i32 = udiv <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
   %V16i32 = udiv <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 
   %I16 = udiv i16 undef, 16
+  %V2i16 = udiv <2 x i16> undef, <i16 2, i16 4>
+  %V4i16 = udiv <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
   %V8i16 = udiv <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V16i16 = udiv <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V32i16 = udiv <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 
   %I8 = udiv i8 undef, 16
+  %V2i8 = udiv <2 x i8> undef, <i8 2, i8 4>
+  %V4i8 = udiv <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+  %V8i8 = udiv <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V16i8 = udiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V32i8 = udiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V64i8 = udiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
@@ -380,14 +476,20 @@ define i32 @sdiv_uniformconstpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i64 = sdiv <4 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i64 = sdiv <8 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = sdiv i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2i32 = sdiv <2 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4i32 = sdiv <4 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V8i32 = sdiv <8 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V16i32 = sdiv <16 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = sdiv i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2i16 = sdiv <2 x i16> undef, splat (i16 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4i16 = sdiv <4 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8i16 = sdiv <8 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V16i16 = sdiv <16 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V32i16 = sdiv <32 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = sdiv i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2i8 = sdiv <2 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4i8 = sdiv <4 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8i8 = sdiv <8 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V16i8 = sdiv <16 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V32i8 = sdiv <32 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V64i8 = sdiv <64 x i8> undef, splat (i8 16)
@@ -401,16 +503,22 @@ define i32 @sdiv_uniformconstpow2() {
   %V8i64 = sdiv <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 
   %I32 = sdiv i32 undef, 16
+  %V2i32 = sdiv <2 x i32> undef, <i32 16, i32 16>
   %V4i32 = sdiv <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
   %V8i32 = sdiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %V16i32 = sdiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 
   %I16 = sdiv i16 undef, 16
+  %V2i16 = sdiv <2 x i16> undef, <i16 16, i16 16>
+  %V4i16 = sdiv <4 x i16> undef, <i16 16, i16 16, i16 16, i16 16>
   %V8i16 = sdiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V16i16 = sdiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V32i16 = sdiv <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 
   %I8 = sdiv i8 undef, 16
+  %V2i8 = sdiv <2 x i8> undef, <i8 16, i8 16>
+  %V4i8 = sdiv <4 x i8> undef, <i8 16, i8 16, i8 16, i8 16>
+  %V8i8 = sdiv <8 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V16i8 = sdiv <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V32i8 = sdiv <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V64i8 = sdiv <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
@@ -426,14 +534,20 @@ define i32 @udiv_uniformconstpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = udiv <4 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i64 = udiv <8 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i32 = udiv <2 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = udiv <4 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = udiv <8 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = udiv <16 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i16 = udiv <2 x i16> undef, splat (i16 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i16 = udiv <4 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i16 = udiv <8 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i16 = udiv <16 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i16 = udiv <32 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i8 = udiv <2 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i8 = udiv <4 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i8 = udiv <8 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i8 = udiv <16 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = udiv <32 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = udiv <64 x i8> undef, splat (i8 16)
@@ -447,16 +561,22 @@ define i32 @udiv_uniformconstpow2() {
   %V8i64 = udiv <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 
   %I32 = udiv i32 undef, 16
+  %V2i32 = udiv <2 x i32> undef, <i32 16, i32 16>
   %V4i32 = udiv <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
   %V8i32 = udiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %V16i32 = udiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 
   %I16 = udiv i16 undef, 16
+  %V2i16 = udiv <2 x i16> undef, <i16 16, i16 16>
+  %V4i16 = udiv <4 x i16> undef, <i16 16, i16 16, i16 16, i16 16>
   %V8i16 = udiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V16i16 = udiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V32i16 = udiv <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 
   %I8 = udiv i8 undef, 16
+  %V2i8 = udiv <2 x i8> undef, <i8 16, i8 16>
+  %V4i8 = udiv <4 x i8> undef, <i8 16, i8 16, i8 16, i8 16>
+  %V8i8 = udiv <8 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V16i8 = udiv <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V32i8 = udiv <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V64i8 = udiv <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
@@ -472,14 +592,20 @@ define i32 @sdiv_constnegpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = sdiv <2 x i32> undef, <i32 -2, i32 -4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = sdiv <2 x i16> undef, <i16 -2, i16 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = sdiv <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = sdiv <2 x i8> undef, <i8 -2, i8 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = sdiv <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = sdiv <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
@@ -493,16 +619,22 @@ define i32 @sdiv_constnegpow2() {
   %V8i64 = sdiv <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 
   %I32 = sdiv i32 undef, -16
+  %V2i32 = sdiv <2 x i32> undef, <i32 -2, i32 -4>
   %V4i32 = sdiv <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
   %V8i32 = sdiv <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
   %V16i32 = sdiv <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 
   %I16 = sdiv i16 undef, -16
+  %V2i16 = sdiv <2 x i16> undef, <i16 -2, i16 -4>
+  %V4i16 = sdiv <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
   %V8i16 = sdiv <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V16i16 = sdiv <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V32i16 = sdiv <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 
   %I8 = sdiv i8 undef, -16
+  %V2i8 = sdiv <2 x i8> undef, <i8 -2, i8 -4>
+  %V4i8 = sdiv <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+  %V8i8 = sdiv <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V16i8 = sdiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V32i8 = sdiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V64i8 = sdiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
@@ -518,14 +650,20 @@ define i32 @udiv_constnegpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = udiv <2 x i32> undef, <i32 -2, i32 -4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = udiv <2 x i16> undef, <i16 -2, i16 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = udiv <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = udiv <2 x i8> undef, <i8 -2, i8 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = udiv <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = udiv <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
@@ -539,16 +677,22 @@ define i32 @udiv_constnegpow2() {
   %V8i64 = udiv <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 
   %I32 = udiv i32 undef, -16
+  %V2i32 = udiv <2 x i32> undef, <i32 -2, i32 -4>
   %V4i32 = udiv <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
   %V8i32 = udiv <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
   %V16i32 = udiv <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 
   %I16 = udiv i16 undef, -16
+  %V2i16 = udiv <2 x i16> undef, <i16 -2, i16 -4>
+  %V4i16 = udiv <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
   %V8i16 = udiv <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V16i16 = udiv <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V32i16 = udiv <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 
   %I8 = udiv i8 undef, -16
+  %V2i8 = udiv <2 x i8> undef, <i8 -2, i8 -4>
+  %V4i8 = udiv <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+  %V8i8 = udiv <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V16i8 = udiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V32i8 = udiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V64i8 = udiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
@@ -564,14 +708,20 @@ define i32 @sdiv_uniformconstnegpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = sdiv <4 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i64 = sdiv <8 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i32 = sdiv <2 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = sdiv <4 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = sdiv <8 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = sdiv <16 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i16 = sdiv <2 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i16 = sdiv <4 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i16 = sdiv <8 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i16 = sdiv <16 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i16 = sdiv <32 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i8 = sdiv <2 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i8 = sdiv <4 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i8 = sdiv <8 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i8 = sdiv <16 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = sdiv <32 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = sdiv <64 x i8> undef, splat (i8 -16)
@@ -585,16 +735,22 @@ define i32 @sdiv_uniformconstnegpow2() {
   %V8i64 = sdiv <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
 
   %I32 = sdiv i32 undef, -16
+  %V2i32 = sdiv <2 x i32> undef, <i32 -16, i32 -16>
   %V4i32 = sdiv <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
   %V8i32 = sdiv <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
   %V16i32 = sdiv <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
 
   %I16 = sdiv i16 undef, -16
+  %V2i16 = sdiv <2 x i16> undef, <i16 -16, i16 -16>
+  %V4i16 = sdiv <4 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16>
   %V8i16 = sdiv <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V16i16 = sdiv <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V32i16 = sdiv <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
 
   %I8 = sdiv i8 undef, -16
+  %V2i8 = sdiv <2 x i8> undef, <i8 -16, i8 -16>
+  %V4i8 = sdiv <4 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16>
+  %V8i8 = sdiv <8 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V16i8 = sdiv <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V32i8 = sdiv <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V64i8 = sdiv <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
@@ -610,14 +766,20 @@ define i32 @udiv_uniformconstnegpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = udiv <4 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i64 = udiv <8 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i32 = udiv <2 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = udiv <4 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = udiv <8 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = udiv <16 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i16 = udiv <2 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i16 = udiv <4 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i16 = udiv <8 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i16 = udiv <16 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i16 = udiv <32 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i8 = udiv <2 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i8 = udiv <4 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i8 = udiv <8 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i8 = udiv <16 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = udiv <32 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = udiv <64 x i8> undef, splat (i8 -16)
@@ -631,16 +793,22 @@ define i32 @udiv_uniformconstnegpow2() {
   %V8i64 = udiv <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
 
   %I32 = udiv i32 undef, -16
+  %V2i32 = udiv <2 x i32> undef, <i32 -16, i32 -16>
   %V4i32 = udiv <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
   %V8i32 = udiv <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
   %V16i32 = udiv <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
 
   %I16 = udiv i16 undef, -16
+  %V2i16 = udiv <2 x i16> undef, <i16 -16, i16 -16>
+  %V4i16 = udiv <4 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16>
   %V8i16 = udiv <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V16i16 = udiv <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V32i16 = udiv <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
 
   %I8 = udiv i8 undef, -16
+  %V2i8 = udiv <2 x i8> undef, <i8 -16, i8 -16>
+  %V4i8 = udiv <4 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16>
+  %V8i8 = udiv <8 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V16i8 = udiv <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V32i8 = udiv <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V64i8 = udiv <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
diff --git a/llvm/test/Analysis/CostModel/AArch64/rem.ll b/llvm/test/Analysis/CostModel/AArch64/rem.ll
index 2f1e8c8..06c05ae 100644
--- a/llvm/test/Analysis/CostModel/AArch64/rem.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/rem.ll
@@ -5,40 +5,55 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define i32 @srem() {
 ; CHECK-LABEL: 'srem'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = srem <2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = srem <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = srem <4 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = srem <2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = srem <4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = srem <8 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, undef
+
   %I64 = srem i64 undef, undef
   %V2i64 = srem <2 x i64> undef, undef
   %V4i64 = srem <4 x i64> undef, undef
   %V8i64 = srem <8 x i64> undef, undef
 
   %I32 = srem i32 undef, undef
+  %V2i32 = srem <2 x i32> undef, undef
   %V4i32 = srem <4 x i32> undef, undef
   %V8i32 = srem <8 x i32> undef, undef
   %V16i32 = srem <16 x i32> undef, undef
 
   %I16 = srem i16 undef, undef
+  %V2i16 = srem <2 x i16> undef, undef
+  %V4i16 = srem <4 x i16> undef, undef
   %V8i16 = srem <8 x i16> undef, undef
   %V16i16 = srem <16 x i16> undef, undef
   %V32i16 = srem <32 x i16> undef, undef
 
   %I8 = srem i8 undef, undef
+  %V2i8 = srem <2 x i8> undef, undef
+  %V4i8 = srem <4 x i8> undef, undef
+  %V8i8 = srem <8 x i8> undef, undef
   %V16i8 = srem <16 x i8> undef, undef
   %V32i8 = srem <32 x i8> undef, undef
   %V64i8 = srem <64 x i8> undef, undef
@@ -48,40 +63,55 @@ define i32 @srem() {
 
 define i32 @urem() {
 ; CHECK-LABEL: 'urem'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = urem i64 undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = urem i128 undef, undef
+
   %I64 = urem i64 undef, undef
   %V2i64 = urem <2 x i64> undef, undef
   %V4i64 = urem <4 x i64> undef, undef
   %V8i64 = urem <8 x i64> undef, undef
 
   %I32 = urem i32 undef, undef
+  %V2i32 = urem <2 x i32> undef, undef
   %V4i32 = urem <4 x i32> undef, undef
   %V8i32 = urem <8 x i32> undef, undef
   %V16i32 = urem <16 x i32> undef, undef
 
   %I16 = urem i16 undef, undef
+  %V2i16 = urem <2 x i16> undef, undef
+  %V4i16 = urem <4 x i16> undef, undef
   %V8i16 = urem <8 x i16> undef, undef
   %V16i16 = urem <16 x i16> undef, undef
   %V32i16 = urem <32 x i16> undef, undef
 
   %I8 = urem i8 undef, undef
+  %V2i8 = urem <2 x i8> undef, undef
+  %V4i8 = urem <4 x i8> undef, undef
+  %V8i8 = urem <8 x i8> undef, undef
   %V16i8 = urem <16 x i8> undef, undef
   %V32i8 = urem <32 x i8> undef, undef
   %V64i8 = urem <64 x i8> undef, undef
@@ -91,40 +121,55 @@ define i32 @urem() {
 
 define i32 @srem_const() {
 ; CHECK-LABEL: 'srem_const'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = srem <2 x i32> undef, <i32 4, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = srem <2 x i16> undef, <i16 4, i16 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = srem <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = srem <2 x i8> undef, <i8 4, i8 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = srem <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = srem <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, 7
+
   %I64 = srem i64 undef, 7
   %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
   %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
   %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 
   %I32 = srem i32 undef, 7
+  %V2i32 = srem <2 x i32> undef, <i32 4, i32 5>
   %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
   %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 
   %I16 = srem i16 undef, 7
+  %V2i16 = srem <2 x i16> undef, <i16 4, i16 5>
+  %V4i16 = srem <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
   %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
   %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
   %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 
   %I8 = srem i8 undef, 7
+  %V2i8 = srem <2 x i8> undef, <i8 4, i8 5>
+  %V4i8 = srem <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+  %V8i8 = srem <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
   %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
@@ -134,40 +179,56 @@ define i32 @srem_const() {
 
 define i32 @urem_const() {
 ; CHECK-LABEL: 'urem_const'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, <i32 4, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, <i16 4, i16 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, <i8 4, i8 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+
+  %I128 = urem i128 undef, 7
+
   %I64 = urem i64 undef, 7
   %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
   %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
   %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 
   %I32 = urem i32 undef, 7
+  %V2i32 = urem <2 x i32> undef, <i32 4, i32 5>
   %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
   %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 
   %I16 = urem i16 undef, 7
+  %V2i16 = urem <2 x i16> undef, <i16 4, i16 5>
+  %V4i16 = urem <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
   %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
   %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
   %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 
   %I8 = urem i8 undef, 7
+  %V2i8 = urem <2 x i8> undef, <i8 4, i8 5>
+  %V4i8 = urem <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+  %V8i8 = urem <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
   %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
@@ -177,40 +238,55 @@ define i32 @urem_const() {
 
 define i32 @srem_uniformconst() {
 ; CHECK-LABEL: 'srem_uniformconst'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = srem <2 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = srem <4 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = srem <8 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = srem <2 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = srem <2 x i16> undef, splat (i16 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = srem <4 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = srem <2 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = srem <4 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = srem <8 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, 7
+
   %I64 = srem i64 undef, 7
   %V2i64 = srem <2 x i64> undef, <i64 7, i64 7>
   %V4i64 = srem <4 x i64> undef, <i64 7, i64 7, i64 7, i64 7>
   %V8i64 = srem <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
 
   %I32 = srem i32 undef, 7
+  %V2i32 = srem <2 x i32> undef, <i32 7, i32 7>
   %V4i32 = srem <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
   %V8i32 = srem <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   %V16i32 = srem <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 
   %I16 = srem i16 undef, 7
+  %V2i16 = srem <2 x i16> undef, <i16 7, i16 7>
+  %V4i16 = srem <4 x i16> undef, <i16 7, i16 7, i16 7, i16 7>
   %V8i16 = srem <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 
   %I8 = srem i8 undef, 7
+  %V2i8 = srem <2 x i8> undef, <i8 7, i8 7>
+  %V4i8 = srem <4 x i8> undef, <i8 7, i8 7, i8 7, i8 7>
+  %V8i8 = srem <8 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -220,40 +296,55 @@ define i32 @srem_uniformconst() {
 
 define i32 @urem_uniformconst() {
 ; CHECK-LABEL: 'urem_uniformconst'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = urem <2 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = urem <4 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = urem <8 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, splat (i16 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = urem i128 undef, 7
+
   %I64 = urem i64 undef, 7
   %V2i64 = urem <2 x i64> undef, <i64 7, i64 7>
   %V4i64 = urem <4 x i64> undef, <i64 7, i64 7, i64 7, i64 7>
   %V8i64 = urem <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
 
   %I32 = urem i32 undef, 7
+  %V2i32 = urem <2 x i32> undef, <i32 7, i32 7>
   %V4i32 = urem <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
   %V8i32 = urem <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   %V16i32 = urem <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 
   %I16 = urem i16 undef, 7
+  %V2i16 = urem <2 x i16> undef, <i16 7, i16 7>
+  %V4i16 = urem <4 x i16> undef, <i16 7, i16 7, i16 7, i16 7>
   %V8i16 = urem <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V16i16 = urem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V32i16 = urem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 
   %I8 = urem i8 undef, 7
+  %V2i8 = urem <2 x i8> undef, <i8 7, i8 7>
+  %V4i8 = urem <4 x i8> undef, <i8 7, i8 7, i8 7, i8 7>
+  %V8i8 = urem <8 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -263,40 +354,55 @@ define i32 @urem_uniformconst() {
 
 define i32 @srem_constpow2() {
 ; CHECK-LABEL: 'srem_constpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = srem i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = srem <2 x i32> undef, <i32 2, i32 4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = srem i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = srem <2 x i16> undef, <i16 2, i16 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = srem <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = srem i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = srem <2 x i8> undef, <i8 2, i8 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = srem <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = srem <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, 16
+
   %I64 = srem i64 undef, 16
   %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
   %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
   %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 
   %I32 = srem i32 undef, 16
+  %V2i32 = srem <2 x i32> undef, <i32 2, i32 4>
   %V4i32 = srem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
   %V8i32 = srem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
   %V16i32 = srem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 
   %I16 = srem i16 undef, 16
+  %V2i16 = srem <2 x i16> undef, <i16 2, i16 4>
+  %V4i16 = srem <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
   %V8i16 = srem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 
   %I8 = srem i8 undef, 16
+  %V2i8 = srem <2 x i8> undef, <i8 2, i8 4>
+  %V4i8 = srem <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+  %V8i8 = srem <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
@@ -306,40 +412,55 @@ define i32 @srem_constpow2() {
 
 define i32 @urem_constpow2() {
 ; CHECK-LABEL: 'urem_constpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, <i64 8, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, <i32 2, i32 4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, <i16 2, i16 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, <i8 2, i8 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = urem i128 undef, 16
+
   %I64 = urem i64 undef, 16
   %V2i64 = urem <2 x i64> undef, <i64 8, i64 16>
   %V4i64 = urem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
   %V8i64 = urem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 
   %I32 = urem i32 undef, 16
+  %V2i32 = urem <2 x i32> undef, <i32 2, i32 4>
   %V4i32 = urem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
   %V8i32 = urem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
   %V16i32 = urem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 
   %I16 = urem i16 undef, 16
+  %V2i16 = urem <2 x i16> undef, <i16 2, i16 4>
+  %V4i16 = urem <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
   %V8i16 = urem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V16i16 = urem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V32i16 = urem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 
   %I8 = urem i8 undef, 16
+  %V2i8 = urem <2 x i8> undef, <i8 2, i8 4>
+  %V4i8 = urem <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+  %V8i8 = urem <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V16i8 = urem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V32i8 = urem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V64i8 = urem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
@@ -349,40 +470,55 @@ define i32 @urem_constpow2() {
 
 define i32 @srem_uniformconstpow2() {
 ; CHECK-LABEL: 'srem_uniformconstpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2i64 = srem <2 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4i64 = srem <4 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i64 = srem <8 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = srem i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2i32 = srem <2 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4i32 = srem <4 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i32 = srem <8 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i32 = srem <16 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = srem i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2i16 = srem <2 x i16> undef, splat (i16 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4i16 = srem <4 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = srem <8 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = srem <16 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = srem <32 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = srem i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2i8 = srem <2 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4i8 = srem <4 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = srem <8 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 704 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, 16
+
   %I64 = srem i64 undef, 16
   %V2i64 = srem <2 x i64> undef, <i64 16, i64 16>
   %V4i64 = srem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
   %V8i64 = srem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 
   %I32 = srem i32 undef, 16
+  %V2i32 = srem <2 x i32> undef, <i32 16, i32 16>
   %V4i32 = srem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
   %V8i32 = srem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %V16i32 = srem <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 
   %I16 = srem i16 undef, 16
+  %V2i16 = srem <2 x i16> undef, <i16 16, i16 16>
+  %V4i16 = srem <4 x i16> undef, <i16 16, i16 16, i16 16, i16 16>
   %V8i16 = srem <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V16i16 = srem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V32i16 = srem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 
   %I8 = srem i8 undef, 16
+  %V2i8 = srem <2 x i8> undef, <i8 16, i8 16>
+  %V4i8 = srem <4 x i8> undef, <i8 16, i8 16, i8 16, i8 16>
+  %V8i8 = srem <8 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
@@ -392,40 +528,55 @@ define i32 @srem_uniformconstpow2() {
 
 define i32 @urem_uniformconstpow2() {
 ; CHECK-LABEL: 'urem_uniformconstpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = urem <2 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = urem <4 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = urem <8 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, splat (i16 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = urem i128 undef, 16
+
   %I64 = urem i64 undef, 16
   %V2i64 = urem <2 x i64> undef, <i64 16, i64 16>
   %V4i64 = urem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
   %V8i64 = urem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 
   %I32 = urem i32 undef, 16
+  %V2i32 = urem <2 x i32> undef, <i32 16, i32 16>
   %V4i32 = urem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
   %V8i32 = urem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %V16i32 = urem <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 
   %I16 = urem i16 undef, 16
+  %V2i16 = urem <2 x i16> undef, <i16 16, i16 16>
+  %V4i16 = urem <4 x i16> undef, <i16 16, i16 16, i16 16, i16 16>
   %V8i16 = urem <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V16i16 = urem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V32i16 = urem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 
   %I8 = urem i8 undef, 16
+  %V2i8 = urem <2 x i8> undef, <i8 16, i8 16>
+  %V4i8 = urem <4 x i8> undef, <i8 16, i8 16, i8 16, i8 16>
+  %V8i8 = urem <8 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V16i8 = urem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V32i8 = urem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V64i8 = urem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
@@ -435,40 +586,55 @@ define i32 @urem_uniformconstpow2() {
 
 define i32 @srem_constnegpow2() {
 ; CHECK-LABEL: 'srem_constnegpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, <i64 -8, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = srem <2 x i32> undef, <i32 -2, i32 -4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = srem <2 x i16> undef, <i16 -2, i16 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = srem <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = srem <2 x i8> undef, <i8 -2, i8 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = srem <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = srem <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, -16
+
   %I64 = srem i64 undef, -16
   %V2i64 = srem <2 x i64> undef, <i64 -8, i64 -16>
   %V4i64 = srem <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
   %V8i64 = srem <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 
   %I32 = srem i32 undef, -16
+  %V2i32 = srem <2 x i32> undef, <i32 -2, i32 -4>
   %V4i32 = srem <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
   %V8i32 = srem <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
   %V16i32 = srem <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 
   %I16 = srem i16 undef, -16
+  %V2i16 = srem <2 x i16> undef, <i16 -2, i16 -4>
+  %V4i16 = srem <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
   %V8i16 = srem <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V16i16 = srem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V32i16 = srem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 
   %I8 = srem i8 undef, -16
+  %V2i8 = srem <2 x i8> undef, <i8 -2, i8 -4>
+  %V4i8 = srem <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+  %V8i8 = srem <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V16i8 = srem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V32i8 = srem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V64i8 = srem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
@@ -478,40 +644,55 @@ define i32 @srem_constnegpow2() {
 
 define i32 @urem_constnegpow2() {
 ; CHECK-LABEL: 'urem_constnegpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, <i64 -8, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, <i32 -2, i32 -4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, <i16 -2, i16 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, <i8 -2, i8 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = urem i128 undef, -16
+
   %I64 = urem i64 undef, -16
   %V2i64 = urem <2 x i64> undef, <i64 -8, i64 -16>
   %V4i64 = urem <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
   %V8i64 = urem <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 
   %I32 = urem i32 undef, -16
+  %V2i32 = urem <2 x i32> undef, <i32 -2, i32 -4>
   %V4i32 = urem <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
   %V8i32 = urem <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
   %V16i32 = urem <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 
   %I16 = urem i16 undef, -16
+  %V2i16 = urem <2 x i16> undef, <i16 -2, i16 -4>
+  %V4i16 = urem <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
   %V8i16 = urem <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V16i16 = urem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V32i16 = urem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 
   %I8 = urem i8 undef, -16
+  %V2i8 = urem <2 x i8> undef, <i8 -2, i8 -4>
+  %V4i8 = urem <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+  %V8i8 = urem <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V16i8 = urem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V32i8 = urem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V64i8 = urem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
@@ -521,40 +702,55 @@ define i32 @urem_constnegpow2() {
 
 define i32 @srem_uniformconstnegpow2() {
 ; CHECK-LABEL: 'srem_uniformconstnegpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = srem <2 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = srem <4 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = srem <8 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = srem <2 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = srem <2 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = srem <4 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = srem <2 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = srem <4 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = srem <8 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, -16
+
   %I64 = srem i64 undef, -16
   %V2i64 = srem <2 x i64> undef, <i64 -16, i64 -16>
   %V4i64 = srem <4 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16>
   %V8i64 = srem <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
 
   %I32 = srem i32 undef, -16
+  %V2i32 = srem <2 x i32> undef, <i32 -16, i32 -16>
   %V4i32 = srem <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
   %V8i32 = srem <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
   %V16i32 = srem <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
 
   %I16 = srem i16 undef, -16
+  %V2i16 = srem <2 x i16> undef, <i16 -16, i16 -16>
+  %V4i16 = srem <4 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16>
   %V8i16 = srem <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V16i16 = srem <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V32i16 = srem <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
 
   %I8 = srem i8 undef, -16
+  %V2i8 = srem <2 x i8> undef, <i8 -16, i8 -16>
+  %V4i8 = srem <4 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16>
+  %V8i8 = srem <8 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V16i8 = srem <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V32i8 = srem <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V64i8 = srem <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
@@ -564,40 +760,55 @@ define i32 @srem_uniformconstnegpow2() {
 
 define i32 @urem_uniformconstnegpow2() {
 ; CHECK-LABEL: 'urem_uniformconstnegpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = urem <2 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = urem <4 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = urem <8 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = urem i128 undef, -16
+
   %I64 = urem i64 undef, -16
   %V2i64 = urem <2 x i64> undef, <i64 -16, i64 -16>
   %V4i64 = urem <4 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16>
   %V8i64 = urem <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
 
   %I32 = urem i32 undef, -16
+  %V2i32 = urem <2 x i32> undef, <i32 -16, i32 -16>
   %V4i32 = urem <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
   %V8i32 = urem <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
   %V16i32 = urem <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
 
   %I16 = urem i16 undef, -16
+  %V2i16 = urem <2 x i16> undef, <i16 -16, i16 -16>
+  %V4i16 = urem <4 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16>
   %V8i16 = urem <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V16i16 = urem <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V32i16 = urem <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
 
   %I8 = urem i8 undef, -16
+  %V2i8 = urem <2 x i8> undef, <i8 -16, i8 -16>
+  %V4i8 = urem <4 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16>
+  %V8i8 = urem <8 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V16i8 = urem <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V32i8 = urem <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V64i8 = urem <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll
new file mode 100644
index 0000000..b81b6a9
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+define void @extract_half() {
+; CHECK-LABEL: 'extract_half'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i8_lo = shufflevector <2 x i8> poison, <2 x i8> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_hi = shufflevector <2 x i8> poison, <2 x i8> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i8_lo = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_mi = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_hi = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i8_lo = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_mi = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_hi = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i16_lo = shufflevector <2 x i16> poison, <2 x i16> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_hi = shufflevector <2 x i16> poison, <2 x i16> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16_lo = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_mi = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_hi = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i32_lo = shufflevector <2 x i32> poison, <2 x i32> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_hi = shufflevector <2 x i32> poison, <2 x i32> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i64_lo = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_hi = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2i8_lo = shufflevector <2 x i8> poison, <2 x i8> poison, <1 x i32> <i32 0>
+  %v2i8_hi = shufflevector <2 x i8> poison, <2 x i8> poison, <1 x i32> <i32 1>
+  %v4i8_lo = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+  %v4i8_mi = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> <i32 1, i32 2>
+  %v4i8_hi = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+  %v8i8_lo = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v8i8_mi = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %v8i8_hi = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %v2i16_lo = shufflevector <2 x i16> poison, <2 x i16> poison, <1 x i32> <i32 0>
+  %v2i16_hi = shufflevector <2 x i16> poison, <2 x i16> poison, <1 x i32> <i32 1>
+  %v4i16_lo = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+  %v4i16_mi = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 1, i32 2>
+  %v4i16_hi = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+  %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %v2i32_lo = shufflevector <2 x i32> poison, <2 x i32> poison, <1 x i32> <i32 0>
+  %v2i32_hi = shufflevector <2 x i32> poison, <2 x i32> poison, <1 x i32> <i32 1>
+  %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+  %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+  %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+  %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %v2i64_lo = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> <i32 0>
+  %v2i64_hi = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> <i32 1>
+  %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
+  %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 1, i32 2>
+  %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
+  %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+  ret void
+}
+
+define void @extract_qtr() {
+; CHECK-LABEL: 'extract_qtr'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i8_lo = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_mi = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_hi = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i8_lo = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_mi = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_hi = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16_lo = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_mi = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_hi = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v4i8_lo = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> <i32 0>
+  %v4i8_mi = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> <i32 1>
+  %v4i8_hi = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> <i32 2>
+  %v8i8_lo = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> <i32 0, i32 1>
+  %v8i8_mi = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> <i32 2, i32 3>
+  %v8i8_hi = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> <i32 4, i32 5>
+  %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+
+  %v4i16_lo = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> <i32 0>
+  %v4i16_mi = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> <i32 1>
+  %v4i16_hi = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> <i32 2>
+  %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+  %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+  %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+  %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+
+  %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 0>
+  %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 1>
+  %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 2>
+  %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 0, i32 1>
+  %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 2, i32 3>
+  %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+  %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+
+  %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 0>
+  %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 1>
+  %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 2>
+  %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 0, i32 1>
+  %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 2, i32 3>
+  %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 4, i32 5>
+
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
index 075397a..9c573c7 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
@@ -1,97 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefix=COST
-; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s --check-prefix=CODE
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
-; COST-LABEL: sel.v8i8
-; COST:       Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
-; CODE-LABEL: sel.v8i8
-; CODE:       tbl v0.8b, { v0.16b }, v1.8b
-define <8 x i8> @sel.v8i8(<8 x i8> %v0, <8 x i8> %v1) {
+define <8 x i8> @sel_v8i8(<8 x i8> %v0, <8 x i8> %v1) {
+; COST-LABEL: 'sel_v8i8'
+; COST-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %tmp0
+;
   %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
   ret <8 x i8> %tmp0
 }
 
-; COST-LABEL: sel.v16i8
-; COST:       Found an estimated cost of 60 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
-; CODE-LABEL: sel.v16i8
-; CODE:       tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-define <16 x i8> @sel.v16i8(<16 x i8> %v0, <16 x i8> %v1) {
+define <16 x i8> @sel_v16i8(<16 x i8> %v0, <16 x i8> %v1) {
+; COST-LABEL: 'sel_v16i8'
+; COST-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %tmp0
+;
   %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
   ret <16 x i8> %tmp0
 }
 
-; COST-LABEL: sel.v4i16
-; COST:       Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CODE-LABEL: sel.v4i16
-; CODE:       rev32 v0.4h, v0.4h
-; CODE:       trn2 v0.4h, v0.4h, v1.4h
-define <4 x i16> @sel.v4i16(<4 x i16> %v0, <4 x i16> %v1) {
+define <4 x i16> @sel_v4i16(<4 x i16> %v0, <4 x i16> %v1) {
+; COST-LABEL: 'sel_v4i16'
+; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %tmp0
+;
   %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   ret <4 x i16> %tmp0
 }
 
-; COST-LABEL: sel.v8i16
-; COST:       Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
-; CODE-LABEL: sel.v8i16
-; CODE:       tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-define <8 x i16> @sel.v8i16(<8 x i16> %v0, <8 x i16> %v1) {
+define <8 x i16> @sel_v8i16(<8 x i16> %v0, <8 x i16> %v1) {
+; COST-LABEL: 'sel_v8i16'
+; COST-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %tmp0
+;
   %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
   ret <8 x i16> %tmp0
 }
 
-; COST-LABEL: sel.v2i32
-; COST:        Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 3>
-; CODE-LABEL: sel.v2i32
-; CODE:       mov v0.s[1], v1.s[1]
-define <2 x i32> @sel.v2i32(<2 x i32> %v0, <2 x i32> %v1) {
+define <2 x i32> @sel_v2i32(<2 x i32> %v0, <2 x i32> %v1) {
+; COST-LABEL: 'sel_v2i32'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 3>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %tmp0
+;
   %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 3>
   ret <2 x i32> %tmp0
 }
 
-; COST-LABEL: sel.v4i32
-; COST:       Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CODE-LABEL: sel.v4i32
-; CODE:       rev64 v0.4s, v0.4s
-; CODE:       trn2 v0.4s, v0.4s, v1.4s
-define <4 x i32> @sel.v4i32(<4 x i32> %v0, <4 x i32> %v1) {
+define <4 x i32> @sel_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
+; COST-LABEL: 'sel_v4i32'
+; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %tmp0
+;
   %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   ret <4 x i32> %tmp0
 }
 
-; COST-LABEL: sel.v2i64
-; COST:       Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> <i32 0, i32 3>
-; CODE-LABEL: sel.v2i64
-; CODE:       mov v0.d[1], v1.d[1]
-define <2 x i64> @sel.v2i64(<2 x i64> %v0, <2 x i64> %v1) {
+define <2 x i64> @sel_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
+; COST-LABEL: 'sel_v2i64'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> <i32 0, i32 3>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %tmp0
+;
   %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> <i32 0, i32 3>
   ret <2 x i64> %tmp0
 }
 
-; COST-LABEL: sel.v2f32
-; COST:       Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 0, i32 3>
-; CODE-LABEL: sel.v2f32
-; CODE:       mov v0.s[1], v1.s[1]
-define <2 x float> @sel.v2f32(<2 x float> %v0, <2 x float> %v1) {
+define <4 x half> @sel_v4f16(<4 x half> %v0, <4 x half> %v1) {
+; COST-LABEL: 'sel_v4f16'
+; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x half> %tmp0
+;
+  %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x half> %tmp0
+}
+
+define <8 x half> @sel_v8f16(<8 x half> %v0, <8 x half> %v1) {
+; COST-LABEL: 'sel_v8f16'
+; COST-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x half> %tmp0
+;
+  %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  ret <8 x half> %tmp0
+}
+
+define <2 x float> @sel_v2f32(<2 x float> %v0, <2 x float> %v1) {
+; COST-LABEL: 'sel_v2f32'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 0, i32 3>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %tmp0
+;
   %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 0, i32 3>
   ret <2 x float> %tmp0
 }
 
-; COST-LABEL: sel.v4f32
-; COST:       Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CODE-LABEL: sel.v4f32
-; CODE:       rev64 v0.4s, v0.4s
-; CODE:       trn2 v0.4s, v0.4s, v1.4s
-define <4 x float> @sel.v4f32(<4 x float> %v0, <4 x float> %v1) {
+define <4 x float> @sel_v4f32(<4 x float> %v0, <4 x float> %v1) {
+; COST-LABEL: 'sel_v4f32'
+; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %tmp0
+;
   %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   ret <4 x float> %tmp0
 }
 
-; COST-LABEL: sel.v2f64
-; COST:       Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> <i32 0, i32 3>
-; CODE-LABEL: sel.v2f64
-; CODE:       mov v0.d[1], v1.d[1]
-define <2 x double> @sel.v2f64(<2 x double> %v0, <2 x double> %v1) {
+define <2 x double> @sel_v2f64(<2 x double> %v0, <2 x double> %v1) {
+; COST-LABEL: 'sel_v2f64'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> <i32 0, i32 3>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %tmp0
+;
   %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> <i32 0, i32 3>
   ret <2 x double> %tmp0
 }
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
index a181567..7107d2b 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
@@ -399,13 +399,13 @@ define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 2>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 3>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 3>
@@ -436,13 +436,13 @@ define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 2>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 3>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 3>
@@ -476,13 +476,13 @@ define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 2>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 3>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 3>
@@ -513,13 +513,13 @@ define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 2>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 3>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 3>
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll
index 56f9e18..d1b230c 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll
@@ -875,15 +875,6 @@ define void @icmp_sle() {
 
 define void @fcmp_oeq() {
 ; CHECK-LABEL: 'fcmp_oeq'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oeq <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oeq <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oeq <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp oeq <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oeq <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oeq <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oeq <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp oeq <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp oeq <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp oeq <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp oeq <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp oeq <8 x float> undef, undef
@@ -902,16 +893,7 @@ define void @fcmp_oeq() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp oeq <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp oeq <2 x half> undef, undef
-  %v4f16 = fcmp oeq <4 x half> undef, undef
-  %v8f16 = fcmp oeq <8 x half> undef, undef
-  %v16f16 = fcmp oeq <16 x half> undef, undef
 
-  %nxv1f16 = fcmp oeq <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp oeq <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp oeq <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp oeq <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp oeq <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp oeq <2 x float> undef, undef
   %v4f32 = fcmp oeq <4 x float> undef, undef
@@ -938,15 +920,6 @@ define void @fcmp_oeq() {
 
 define void @fcmp_one() {
 ; CHECK-LABEL: 'fcmp_one'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = fcmp one <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = fcmp one <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = fcmp one <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v16f16 = fcmp one <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16 = fcmp one <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16 = fcmp one <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16 = fcmp one <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv8f16 = fcmp one <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f16 = fcmp one <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32 = fcmp one <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = fcmp one <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8f32 = fcmp one <8 x float> undef, undef
@@ -965,16 +938,7 @@ define void @fcmp_one() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv8f64 = fcmp one <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp one <2 x half> undef, undef
-  %v4f16 = fcmp one <4 x half> undef, undef
-  %v8f16 = fcmp one <8 x half> undef, undef
-  %v16f16 = fcmp one <16 x half> undef, undef
 
-  %nxv1f16 = fcmp one <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp one <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp one <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp one <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp one <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp one <2 x float> undef, undef
   %v4f32 = fcmp one <4 x float> undef, undef
@@ -1001,15 +965,6 @@ define void @fcmp_one() {
 
 define void @fcmp_olt() {
 ; CHECK-LABEL: 'fcmp_olt'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp olt <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp olt <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp olt <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp olt <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp olt <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp olt <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp olt <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp olt <8 x float> undef, undef
@@ -1028,16 +983,7 @@ define void @fcmp_olt() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp olt <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp olt <2 x half> undef, undef
-  %v4f16 = fcmp olt <4 x half> undef, undef
-  %v8f16 = fcmp olt <8 x half> undef, undef
-  %v16f16 = fcmp olt <16 x half> undef, undef
 
-  %nxv1f16 = fcmp olt <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp olt <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp olt <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp olt <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp olt <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp olt <2 x float> undef, undef
   %v4f32 = fcmp olt <4 x float> undef, undef
@@ -1064,15 +1010,6 @@ define void @fcmp_olt() {
 
 define void @fcmp_ole() {
 ; CHECK-LABEL: 'fcmp_ole'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ole <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ole <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ole <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp ole <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ole <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ole <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ole <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp ole <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp ole <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp ole <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp ole <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp ole <8 x float> undef, undef
@@ -1091,16 +1028,7 @@ define void @fcmp_ole() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp ole <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp ole <2 x half> undef, undef
-  %v4f16 = fcmp ole <4 x half> undef, undef
-  %v8f16 = fcmp ole <8 x half> undef, undef
-  %v16f16 = fcmp ole <16 x half> undef, undef
 
-  %nxv1f16 = fcmp ole <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp ole <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp ole <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp ole <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp ole <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp ole <2 x float> undef, undef
   %v4f32 = fcmp ole <4 x float> undef, undef
@@ -1127,15 +1055,6 @@ define void @fcmp_ole() {
 
 define void @fcmp_ogt() {
 ; CHECK-LABEL: 'fcmp_ogt'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ogt <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ogt <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ogt <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp ogt <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ogt <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ogt <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ogt <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp ogt <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp ogt <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp ogt <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp ogt <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp ogt <8 x float> undef, undef
@@ -1154,16 +1073,7 @@ define void @fcmp_ogt() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp ogt <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp ogt <2 x half> undef, undef
-  %v4f16 = fcmp ogt <4 x half> undef, undef
-  %v8f16 = fcmp ogt <8 x half> undef, undef
-  %v16f16 = fcmp ogt <16 x half> undef, undef
 
-  %nxv1f16 = fcmp ogt <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp ogt <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp ogt <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp ogt <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp ogt <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp ogt <2 x float> undef, undef
   %v4f32 = fcmp ogt <4 x float> undef, undef
@@ -1190,15 +1100,6 @@ define void @fcmp_ogt() {
 
 define void @fcmp_oge() {
 ; CHECK-LABEL: 'fcmp_oge'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oge <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oge <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oge <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp oge <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oge <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oge <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oge <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp oge <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp oge <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp oge <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp oge <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp oge <8 x float> undef, undef
@@ -1217,16 +1118,7 @@ define void @fcmp_oge() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp oge <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp oge <2 x half> undef, undef
-  %v4f16 = fcmp oge <4 x half> undef, undef
-  %v8f16 = fcmp oge <8 x half> undef, undef
-  %v16f16 = fcmp oge <16 x half> undef, undef
 
-  %nxv1f16 = fcmp oge <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp oge <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp oge <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp oge <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp oge <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp oge <2 x float> undef, undef
   %v4f32 = fcmp oge <4 x float> undef, undef
@@ -1253,15 +1145,6 @@ define void @fcmp_oge() {
 
 define void @fcmp_ueq() {
 ; CHECK-LABEL: 'fcmp_ueq'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = fcmp ueq <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = fcmp ueq <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = fcmp ueq <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v16f16 = fcmp ueq <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16 = fcmp ueq <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16 = fcmp ueq <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16 = fcmp ueq <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv8f16 = fcmp ueq <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f16 = fcmp ueq <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32 = fcmp ueq <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = fcmp ueq <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8f32 = fcmp ueq <8 x float> undef, undef
@@ -1280,16 +1163,7 @@ define void @fcmp_ueq() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv8f64 = fcmp ueq <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp ueq <2 x half> undef, undef
-  %v4f16 = fcmp ueq <4 x half> undef, undef
-  %v8f16 = fcmp ueq <8 x half> undef, undef
-  %v16f16 = fcmp ueq <16 x half> undef, undef
 
-  %nxv1f16 = fcmp ueq <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp ueq <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp ueq <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp ueq <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp ueq <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp ueq <2 x float> undef, undef
   %v4f32 = fcmp ueq <4 x float> undef, undef
@@ -1316,15 +1190,6 @@ define void @fcmp_ueq() {
 
 define void @fcmp_une() {
 ; CHECK-LABEL: 'fcmp_une'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp une <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp une <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp une <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp une <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp une <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp une <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp une <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp une <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp une <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp une <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp une <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp une <8 x float> undef, undef
@@ -1343,16 +1208,7 @@ define void @fcmp_une() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp une <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp une <2 x half> undef, undef
-  %v4f16 = fcmp une <4 x half> undef, undef
-  %v8f16 = fcmp une <8 x half> undef, undef
-  %v16f16 = fcmp une <16 x half> undef, undef
 
-  %nxv1f16 = fcmp une <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp une <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp une <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp une <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp une <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp une <2 x float> undef, undef
   %v4f32 = fcmp une <4 x float> undef, undef
@@ -1379,15 +1235,6 @@ define void @fcmp_une() {
 
 define void @fcmp_ult() {
 ; CHECK-LABEL: 'fcmp_ult'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ult <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ult <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ult <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ult <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ult <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ult <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ult <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ult <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ult <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fcmp ult <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fcmp ult <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32 = fcmp ult <8 x float> undef, undef
@@ -1406,16 +1253,7 @@ define void @fcmp_ult() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv8f64 = fcmp ult <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp ult <2 x half> undef, undef
-  %v4f16 = fcmp ult <4 x half> undef, undef
-  %v8f16 = fcmp ult <8 x half> undef, undef
-  %v16f16 = fcmp ult <16 x half> undef, undef
 
-  %nxv1f16 = fcmp ult <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp ult <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp ult <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp ult <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp ult <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp ult <2 x float> undef, undef
   %v4f32 = fcmp ult <4 x float> undef, undef
@@ -1442,15 +1280,6 @@ define void @fcmp_ult() {
 
 define void @fcmp_ule() {
 ; CHECK-LABEL: 'fcmp_ule'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ule <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ule <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ule <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ule <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ule <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ule <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ule <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ule <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ule <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fcmp ule <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fcmp ule <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32 = fcmp ule <8 x float> undef, undef
@@ -1469,16 +1298,7 @@ define void @fcmp_ule() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv8f64 = fcmp ule <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp ule <2 x half> undef, undef
-  %v4f16 = fcmp ule <4 x half> undef, undef
-  %v8f16 = fcmp ule <8 x half> undef, undef
-  %v16f16 = fcmp ule <16 x half> undef, undef
 
-  %nxv1f16 = fcmp ule <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp ule <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp ule <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp ule <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp ule <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp ule <2 x float> undef, undef
   %v4f32 = fcmp ule <4 x float> undef, undef
@@ -1505,15 +1325,6 @@ define void @fcmp_ule() {
 
 define void @fcmp_ugt() {
 ; CHECK-LABEL: 'fcmp_ugt'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ugt <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ugt <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ugt <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ugt <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ugt <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ugt <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ugt <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ugt <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ugt <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fcmp ugt <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fcmp ugt <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32 = fcmp ugt <8 x float> undef, undef
@@ -1532,16 +1343,7 @@ define void @fcmp_ugt() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv8f64 = fcmp ugt <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp ugt <2 x half> undef, undef
-  %v4f16 = fcmp ugt <4 x half> undef, undef
-  %v8f16 = fcmp ugt <8 x half> undef, undef
-  %v16f16 = fcmp ugt <16 x half> undef, undef
 
-  %nxv1f16 = fcmp ugt <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp ugt <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp ugt <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp ugt <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp ugt <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp ugt <2 x float> undef, undef
   %v4f32 = fcmp ugt <4 x float> undef, undef
@@ -1568,15 +1370,6 @@ define void @fcmp_ugt() {
 
 define void @fcmp_uge() {
 ; CHECK-LABEL: 'fcmp_uge'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp uge <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp uge <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp uge <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp uge <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp uge <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp uge <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp uge <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp uge <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp uge <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fcmp uge <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fcmp uge <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32 = fcmp uge <8 x float> undef, undef
@@ -1595,16 +1388,7 @@ define void @fcmp_uge() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv8f64 = fcmp uge <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp uge <2 x half> undef, undef
-  %v4f16 = fcmp uge <4 x half> undef, undef
-  %v8f16 = fcmp uge <8 x half> undef, undef
-  %v16f16 = fcmp uge <16 x half> undef, undef
 
-  %nxv1f16 = fcmp uge <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp uge <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp uge <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp uge <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp uge <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp uge <2 x float> undef, undef
   %v4f32 = fcmp uge <4 x float> undef, undef
@@ -1631,15 +1415,6 @@ define void @fcmp_uge() {
 
 define void @fcmp_true() {
 ; CHECK-LABEL: 'fcmp_true'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp true <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp true <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp true <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp true <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp true <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp true <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp true <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp true <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp true <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp true <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp true <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32 = fcmp true <8 x float> undef, undef
@@ -1658,16 +1433,7 @@ define void @fcmp_true() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64 = fcmp true <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp true <2 x half> undef, undef
-  %v4f16 = fcmp true <4 x half> undef, undef
-  %v8f16 = fcmp true <8 x half> undef, undef
-  %v16f16 = fcmp true <16 x half> undef, undef
 
-  %nxv1f16 = fcmp true <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp true <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp true <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp true <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp true <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp true <2 x float> undef, undef
   %v4f32 = fcmp true <4 x float> undef, undef
@@ -1694,15 +1460,6 @@ define void @fcmp_true() {
 
 define void @fcmp_false() {
 ; CHECK-LABEL: 'fcmp_false'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp false <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp false <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp false <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp false <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp false <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp false <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp false <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp false <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp false <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp false <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp false <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32 = fcmp false <8 x float> undef, undef
@@ -1721,16 +1478,7 @@ define void @fcmp_false() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64 = fcmp false <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp false <2 x half> undef, undef
-  %v4f16 = fcmp false <4 x half> undef, undef
-  %v8f16 = fcmp false <8 x half> undef, undef
-  %v16f16 = fcmp false <16 x half> undef, undef
-
-  %nxv1f16 = fcmp false <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp false <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp false <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp false <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp false <vscale x 16 x half> undef, undef
+
 
   %v2f32 = fcmp false <2 x float> undef, undef
   %v4f32 = fcmp false <4 x float> undef, undef
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-fcmp-f16.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-fcmp-f16.ll
new file mode 100644
index 0000000..8396e80
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-fcmp-f16.ll
@@ -0,0 +1,677 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh -riscv-v-vector-bits-min=-1 < %s | FileCheck %s --check-prefix=NOF16
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=-1 < %s | FileCheck %s --check-prefix=VFH
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin -riscv-v-vector-bits-min=-1 < %s | FileCheck %s --check-prefix=VFHMIN
+
+define void @fcmp_oeq() {
+; NOF16-LABEL: 'fcmp_oeq'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp oeq <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp oeq <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp oeq <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp oeq <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp oeq <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp oeq <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp oeq <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp oeq <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp oeq <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_oeq'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oeq <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oeq <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oeq <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp oeq <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oeq <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oeq <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oeq <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp oeq <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp oeq <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_oeq'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oeq <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oeq <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oeq <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp oeq <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oeq <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oeq <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oeq <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp oeq <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp oeq <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp oeq <2 x half> undef, undef
+  %v4f16 = fcmp oeq <4 x half> undef, undef
+  %v8f16 = fcmp oeq <8 x half> undef, undef
+  %v16f16 = fcmp oeq <16 x half> undef, undef
+  %nxv1f16 = fcmp oeq <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp oeq <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp oeq <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp oeq <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp oeq <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_one() {
+; NOF16-LABEL: 'fcmp_one'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp one <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp one <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp one <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp one <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp one <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp one <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp one <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp one <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp one <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_one'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = fcmp one <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = fcmp one <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = fcmp one <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v16f16 = fcmp one <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16 = fcmp one <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16 = fcmp one <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16 = fcmp one <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv8f16 = fcmp one <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f16 = fcmp one <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_one'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp one <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp one <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp one <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp one <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp one <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp one <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp one <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp one <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp one <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp one <2 x half> undef, undef
+  %v4f16 = fcmp one <4 x half> undef, undef
+  %v8f16 = fcmp one <8 x half> undef, undef
+  %v16f16 = fcmp one <16 x half> undef, undef
+  %nxv1f16 = fcmp one <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp one <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp one <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp one <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp one <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_olt() {
+; NOF16-LABEL: 'fcmp_olt'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp olt <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp olt <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp olt <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp olt <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp olt <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_olt'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp olt <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp olt <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp olt <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp olt <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp olt <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_olt'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp olt <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp olt <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp olt <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp olt <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp olt <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp olt <2 x half> undef, undef
+  %v4f16 = fcmp olt <4 x half> undef, undef
+  %v8f16 = fcmp olt <8 x half> undef, undef
+  %v16f16 = fcmp olt <16 x half> undef, undef
+  %nxv1f16 = fcmp olt <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp olt <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp olt <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp olt <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp olt <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_ole() {
+; NOF16-LABEL: 'fcmp_ole'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ole <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ole <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ole <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ole <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ole <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ole <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ole <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ole <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ole <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_ole'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ole <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ole <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ole <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp ole <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ole <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ole <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ole <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp ole <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp ole <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_ole'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ole <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ole <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ole <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ole <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ole <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ole <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ole <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ole <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ole <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp ole <2 x half> undef, undef
+  %v4f16 = fcmp ole <4 x half> undef, undef
+  %v8f16 = fcmp ole <8 x half> undef, undef
+  %v16f16 = fcmp ole <16 x half> undef, undef
+  %nxv1f16 = fcmp ole <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp ole <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp ole <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp ole <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp ole <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_ogt() {
+; NOF16-LABEL: 'fcmp_ogt'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ogt <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ogt <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ogt <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ogt <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ogt <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ogt <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ogt <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ogt <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ogt <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_ogt'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ogt <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ogt <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ogt <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp ogt <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ogt <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ogt <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ogt <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp ogt <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp ogt <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_ogt'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ogt <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ogt <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ogt <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ogt <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ogt <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ogt <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ogt <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ogt <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ogt <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp ogt <2 x half> undef, undef
+  %v4f16 = fcmp ogt <4 x half> undef, undef
+  %v8f16 = fcmp ogt <8 x half> undef, undef
+  %v16f16 = fcmp ogt <16 x half> undef, undef
+  %nxv1f16 = fcmp ogt <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp ogt <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp ogt <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp ogt <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp ogt <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_oge() {
+; NOF16-LABEL: 'fcmp_oge'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp oge <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp oge <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp oge <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp oge <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp oge <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp oge <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp oge <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp oge <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp oge <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_oge'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oge <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oge <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oge <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp oge <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oge <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oge <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oge <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp oge <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp oge <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_oge'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oge <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oge <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oge <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp oge <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oge <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oge <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oge <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp oge <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp oge <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp oge <2 x half> undef, undef
+  %v4f16 = fcmp oge <4 x half> undef, undef
+  %v8f16 = fcmp oge <8 x half> undef, undef
+  %v16f16 = fcmp oge <16 x half> undef, undef
+  %nxv1f16 = fcmp oge <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp oge <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp oge <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp oge <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp oge <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_ueq() {
+; NOF16-LABEL: 'fcmp_ueq'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ueq <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ueq <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ueq <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ueq <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ueq <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ueq <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ueq <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ueq <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ueq <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_ueq'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = fcmp ueq <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = fcmp ueq <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = fcmp ueq <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v16f16 = fcmp ueq <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16 = fcmp ueq <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16 = fcmp ueq <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16 = fcmp ueq <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv8f16 = fcmp ueq <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f16 = fcmp ueq <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_ueq'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ueq <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ueq <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ueq <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ueq <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ueq <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ueq <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ueq <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ueq <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ueq <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp ueq <2 x half> undef, undef
+  %v4f16 = fcmp ueq <4 x half> undef, undef
+  %v8f16 = fcmp ueq <8 x half> undef, undef
+  %v16f16 = fcmp ueq <16 x half> undef, undef
+  %nxv1f16 = fcmp ueq <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp ueq <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp ueq <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp ueq <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp ueq <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_une() {
+; NOF16-LABEL: 'fcmp_une'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp une <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp une <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp une <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp une <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp une <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp une <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp une <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp une <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp une <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_une'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp une <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp une <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp une <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp une <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp une <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp une <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp une <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp une <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp une <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_une'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp une <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp une <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp une <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp une <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp une <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp une <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp une <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp une <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp une <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp une <2 x half> undef, undef
+  %v4f16 = fcmp une <4 x half> undef, undef
+  %v8f16 = fcmp une <8 x half> undef, undef
+  %v16f16 = fcmp une <16 x half> undef, undef
+  %nxv1f16 = fcmp une <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp une <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp une <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp une <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp une <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_ult() {
+; NOF16-LABEL: 'fcmp_ult'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ult <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ult <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ult <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ult <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ult <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ult <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ult <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ult <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ult <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_ult'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ult <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ult <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ult <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ult <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ult <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ult <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ult <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ult <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ult <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_ult'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ult <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ult <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ult <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ult <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ult <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ult <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ult <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ult <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ult <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp ult <2 x half> undef, undef
+  %v4f16 = fcmp ult <4 x half> undef, undef
+  %v8f16 = fcmp ult <8 x half> undef, undef
+  %v16f16 = fcmp ult <16 x half> undef, undef
+  %nxv1f16 = fcmp ult <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp ult <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp ult <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp ult <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp ult <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_ule() {
+; NOF16-LABEL: 'fcmp_ule'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ule <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ule <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ule <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ule <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ule <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ule <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ule <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ule <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ule <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_ule'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ule <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ule <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ule <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ule <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ule <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ule <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ule <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ule <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ule <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_ule'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ule <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ule <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ule <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ule <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ule <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ule <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ule <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ule <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ule <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp ule <2 x half> undef, undef
+  %v4f16 = fcmp ule <4 x half> undef, undef
+  %v8f16 = fcmp ule <8 x half> undef, undef
+  %v16f16 = fcmp ule <16 x half> undef, undef
+  %nxv1f16 = fcmp ule <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp ule <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp ule <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp ule <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp ule <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_ugt() {
+; NOF16-LABEL: 'fcmp_ugt'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ugt <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ugt <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ugt <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ugt <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ugt <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ugt <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ugt <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ugt <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ugt <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_ugt'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ugt <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ugt <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ugt <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ugt <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ugt <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ugt <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ugt <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ugt <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ugt <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_ugt'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ugt <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ugt <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ugt <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ugt <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ugt <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ugt <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ugt <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ugt <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ugt <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp ugt <2 x half> undef, undef
+  %v4f16 = fcmp ugt <4 x half> undef, undef
+  %v8f16 = fcmp ugt <8 x half> undef, undef
+  %v16f16 = fcmp ugt <16 x half> undef, undef
+  %nxv1f16 = fcmp ugt <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp ugt <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp ugt <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp ugt <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp ugt <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_uge() {
+; NOF16-LABEL: 'fcmp_uge'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp uge <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp uge <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp uge <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp uge <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp uge <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp uge <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp uge <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp uge <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp uge <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_uge'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp uge <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp uge <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp uge <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp uge <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp uge <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp uge <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp uge <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp uge <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp uge <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_uge'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp uge <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp uge <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp uge <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp uge <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp uge <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp uge <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp uge <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp uge <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp uge <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp uge <2 x half> undef, undef
+  %v4f16 = fcmp uge <4 x half> undef, undef
+  %v8f16 = fcmp uge <8 x half> undef, undef
+  %v16f16 = fcmp uge <16 x half> undef, undef
+  %nxv1f16 = fcmp uge <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp uge <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp uge <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp uge <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp uge <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_true() {
+; NOF16-LABEL: 'fcmp_true'
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %v2f16 = fcmp true <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %v4f16 = fcmp true <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %v8f16 = fcmp true <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %v16f16 = fcmp true <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp true <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp true <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp true <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp true <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp true <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_true'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp true <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp true <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp true <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp true <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp true <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp true <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp true <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp true <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp true <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_true'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp true <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp true <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp true <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp true <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp true <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp true <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp true <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp true <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp true <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp true <2 x half> undef, undef
+  %v4f16 = fcmp true <4 x half> undef, undef
+  %v8f16 = fcmp true <8 x half> undef, undef
+  %v16f16 = fcmp true <16 x half> undef, undef
+  %nxv1f16 = fcmp true <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp true <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp true <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp true <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp true <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_false() {
+; NOF16-LABEL: 'fcmp_false'
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %v2f16 = fcmp false <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %v4f16 = fcmp false <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %v8f16 = fcmp false <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %v16f16 = fcmp false <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp false <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp false <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp false <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp false <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp false <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_false'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp false <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp false <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp false <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp false <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp false <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp false <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp false <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp false <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp false <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_false'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp false <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp false <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp false <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp false <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp false <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp false <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp false <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp false <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp false <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp false <2 x half> undef, undef
+  %v4f16 = fcmp false <4 x half> undef, undef
+  %v8f16 = fcmp false <8 x half> undef, undef
+  %v16f16 = fcmp false <16 x half> undef, undef
+  %nxv1f16 = fcmp false <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp false <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp false <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp false <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp false <vscale x 16 x half> undef, undef
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll
index 30bae7e..cada8ab 100644
--- a/llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll
@@ -734,7 +734,7 @@ define void @shuffle2() vscale_range(2,2) {
 define void @multipart() vscale_range(2,2) {
 ; RV32-LABEL: 'multipart'
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16a = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16b = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16b = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16c = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v16d = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32a = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
@@ -743,18 +743,18 @@ define void @multipart() vscale_range(2,2) {
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v32many = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v32many2 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> <i32 1, i32 4, i32 8, i32 12, i32 17, i32 20, i32 24, i32 28, i32 2, i32 6, i32 11, i32 14, i32 18, i32 22, i32 27, i32 30>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v323 = shufflevector <3 x i32> poison, <3 x i32> poison, <3 x i32> <i32 2, i32 3, i32 0>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64a = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64a = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64b = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v64ab = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 1, i32 1, i32 4, i32 4>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64a = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64a = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> <i32 1, i32 1>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64b = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> zeroinitializer
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %f64ab = shufflevector <4 x double> poison, <4 x double> poison, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; RV64-LABEL: 'multipart'
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16a = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16b = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16b = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16c = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v16d = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32a = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
@@ -763,18 +763,18 @@ define void @multipart() vscale_range(2,2) {
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v32many = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v32many2 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> <i32 1, i32 4, i32 8, i32 12, i32 17, i32 20, i32 24, i32 28, i32 2, i32 6, i32 11, i32 14, i32 18, i32 22, i32 27, i32 30>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v323 = shufflevector <3 x i32> poison, <3 x i32> poison, <3 x i32> <i32 2, i32 3, i32 0>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64a = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64a = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64b = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64ab = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v64d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 1, i32 1, i32 4, i32 4>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64a = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64a = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> <i32 1, i32 1>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64b = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> zeroinitializer
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %f64ab = shufflevector <4 x double> poison, <4 x double> poison, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SIZE-LABEL: 'multipart'
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16a = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16b = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16b = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16c = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16d = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32a = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
@@ -783,11 +783,11 @@ define void @multipart() vscale_range(2,2) {
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v32many = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v32many2 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> <i32 1, i32 4, i32 8, i32 12, i32 17, i32 20, i32 24, i32 28, i32 2, i32 6, i32 11, i32 14, i32 18, i32 22, i32 27, i32 30>
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v323 = shufflevector <3 x i32> poison, <3 x i32> poison, <3 x i32> <i32 2, i32 3, i32 0>
-; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64a = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64a = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64b = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64ab = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v64d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 1, i32 1, i32 4, i32 4>
-; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64a = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64a = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> <i32 1, i32 1>
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64b = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64ab = shufflevector <4 x double> poison, <4 x double> poison, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
diff --git a/llvm/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll b/llvm/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll
index 20d2736..033537d 100644
--- a/llvm/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll
@@ -247,21 +247,9 @@ define <4 x float> @test_v4f32_2(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_v4f32_3(<4 x float> %a, <4 x float> %b) {
-; SSE2-LABEL: 'test_v4f32_3'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %1
-;
-; SSSE3-LABEL: 'test_v4f32_3'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %1
-;
-; SSE42-LABEL: 'test_v4f32_3'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %1
-;
-; AVX-LABEL: 'test_v4f32_3'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %1
+; CHECK-LABEL: 'test_v4f32_3'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %1
 ;
   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
   ret <4 x float> %1
@@ -294,13 +282,9 @@ define <4 x i64> @test_v4i64_2(<4 x i64> %a, <4 x i64> %b) {
 }
 
 define <4 x i64> @test_v4i64_3(<4 x i64> %a, <4 x i64> %b) {
-; SSE-LABEL: 'test_v4i64_3'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %1
-;
-; AVX-LABEL: 'test_v4i64_3'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %1
+; CHECK-LABEL: 'test_v4i64_3'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %1
 ;
   %1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   ret <4 x i64> %1
@@ -333,13 +317,9 @@ define <4 x double> @test_v4f64_2(<4 x double> %a, <4 x double> %b) {
 }
 
 define <4 x double> @test_v4f64_3(<4 x double> %a, <4 x double> %b) {
-; SSE-LABEL: 'test_v4f64_3'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %1
-;
-; AVX-LABEL: 'test_v4f64_3'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %1
+; CHECK-LABEL: 'test_v4f64_3'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %1
 ;
   %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
   ret <4 x double> %1
diff --git a/llvm/test/Analysis/CostModel/X86/reduction.ll b/llvm/test/Analysis/CostModel/X86/reduction.ll
index c555459..d7cf8e6 100644
--- a/llvm/test/Analysis/CostModel/X86/reduction.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduction.ll
@@ -926,8 +926,8 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
 
 define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; SSE2-LABEL: 'pairwise_reduction8float'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -939,8 +939,8 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction8float'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -952,8 +952,8 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSE42-LABEL: 'pairwise_reduction8float'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -991,8 +991,8 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SLM-LABEL: 'pairwise_reduction8float'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-codesize.ll b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-codesize.ll
index 4e42351..11ee433 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-codesize.ll
@@ -21,13 +21,13 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -37,11 +37,11 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -79,13 +79,13 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -95,11 +95,11 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -133,98 +133,36 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 }
 
 define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
-; SSE2-LABEL: 'test_vXf32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SSSE3-LABEL: 'test_vXf32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SSE42-LABEL: 'test_vXf32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; SSE-LABEL: 'test_vXf32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -234,14 +172,14 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -249,11 +187,11 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -265,14 +203,14 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -280,11 +218,11 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -363,13 +301,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -377,11 +315,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -394,13 +332,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -408,11 +346,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -425,13 +363,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -439,11 +377,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -462,7 +400,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -470,11 +408,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -493,7 +431,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -501,11 +439,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -594,7 +532,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -602,11 +540,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -629,7 +567,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -637,11 +575,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -664,7 +602,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -672,11 +610,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-latency.ll
index 61978ba..ac407cb 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-latency.ll
@@ -21,13 +21,13 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -37,11 +37,11 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -79,13 +79,13 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -95,11 +95,11 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -133,98 +133,36 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 }
 
 define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
-; SSE2-LABEL: 'test_vXf32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SSSE3-LABEL: 'test_vXf32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SSE42-LABEL: 'test_vXf32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; SSE-LABEL: 'test_vXf32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -234,14 +172,14 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -249,11 +187,11 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -265,14 +203,14 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -280,11 +218,11 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -363,13 +301,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -377,11 +315,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -394,13 +332,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -408,11 +346,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -425,13 +363,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -439,11 +377,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -462,7 +400,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -470,11 +408,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -493,7 +431,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -501,11 +439,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -594,7 +532,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -602,11 +540,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -629,7 +567,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -637,11 +575,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -664,7 +602,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -672,11 +610,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-sizelatency.ll
index 94e3bc3..46d9e46 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-sizelatency.ll
@@ -21,13 +21,13 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -37,11 +37,11 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -79,13 +79,13 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -95,11 +95,11 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -133,98 +133,36 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 }
 
 define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
-; SSE2-LABEL: 'test_vXf32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SSSE3-LABEL: 'test_vXf32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SSE42-LABEL: 'test_vXf32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; SSE-LABEL: 'test_vXf32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -234,14 +172,14 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -249,11 +187,11 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -265,14 +203,14 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -280,11 +218,11 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -363,13 +301,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -377,11 +315,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -394,13 +332,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -408,11 +346,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -425,13 +363,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -439,11 +377,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -462,7 +400,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -470,11 +408,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -493,7 +431,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -501,11 +439,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -594,7 +532,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -602,11 +540,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -629,7 +567,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -637,11 +575,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -664,7 +602,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -672,11 +610,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll
index fe3e61d..bc3af51 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll
@@ -21,13 +21,13 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -37,11 +37,11 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -79,13 +79,13 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -95,11 +95,11 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -133,98 +133,36 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 }
 
 define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
-; SSE2-LABEL: 'test_vXf32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; SSSE3-LABEL: 'test_vXf32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; SSE42-LABEL: 'test_vXf32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE-LABEL: 'test_vXf32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -234,14 +172,14 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -249,11 +187,11 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -265,14 +203,14 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -280,11 +218,11 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -363,13 +301,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -377,11 +315,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -394,13 +332,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -408,11 +346,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -425,13 +363,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -439,11 +377,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -462,7 +400,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -470,11 +408,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -493,7 +431,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -501,11 +439,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -594,7 +532,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -602,11 +540,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -629,7 +567,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -637,11 +575,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -664,7 +602,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -672,11 +610,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-select-codesize.ll b/llvm/test/Analysis/CostModel/X86/shuffle-select-codesize.ll
index e66bce0..0173439 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-select-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-select-codesize.ll
@@ -20,30 +20,30 @@
 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512, <16 x double> %src1024, <2 x double> %src128_1, <4 x double> %src256_1, <8 x double> %src512_1, <16 x double> %src1024_1) {
 ; SSE-LABEL: 'test_vXf64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXf64'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXf64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
@@ -56,30 +56,30 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512, <16 x i64> %src1024, <2 x i64> %src128_1, <4 x i64> %src256_1, <8 x i64> %src512_1, <16 x i64> %src1024_1) {
 ; SSE-LABEL: 'test_vXi64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi64'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXi64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
@@ -93,41 +93,41 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE2-LABEL: 'test_vXf32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXf32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXf32'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXf32'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf32'
@@ -135,7 +135,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
@@ -150,41 +150,41 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-LABEL: 'test_vXi32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi32'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXi32'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi32'
@@ -192,7 +192,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
@@ -208,27 +208,27 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi16'
@@ -236,8 +236,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi16'
@@ -245,8 +245,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi16'
@@ -254,8 +254,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi16'
@@ -264,7 +264,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
@@ -282,8 +282,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi8'
@@ -291,8 +291,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi8'
@@ -300,8 +300,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi8'
@@ -310,7 +310,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi8'
@@ -319,7 +319,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi8'
@@ -328,7 +328,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi8'
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-select-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-select-latency.ll
index 77f0072..d45c96c 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-select-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-select-latency.ll
@@ -20,30 +20,30 @@
 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512, <16 x double> %src1024, <2 x double> %src128_1, <4 x double> %src256_1, <8 x double> %src512_1, <16 x double> %src1024_1) {
 ; SSE-LABEL: 'test_vXf64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXf64'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXf64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
@@ -56,30 +56,30 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512, <16 x i64> %src1024, <2 x i64> %src128_1, <4 x i64> %src256_1, <8 x i64> %src512_1, <16 x i64> %src1024_1) {
 ; SSE-LABEL: 'test_vXi64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi64'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXi64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
@@ -93,41 +93,41 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE2-LABEL: 'test_vXf32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXf32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXf32'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXf32'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf32'
@@ -135,7 +135,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
@@ -150,41 +150,41 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-LABEL: 'test_vXi32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi32'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXi32'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi32'
@@ -192,7 +192,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
@@ -208,27 +208,27 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi16'
@@ -236,8 +236,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi16'
@@ -245,8 +245,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi16'
@@ -254,8 +254,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi16'
@@ -264,7 +264,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
@@ -282,8 +282,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi8'
@@ -291,8 +291,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi8'
@@ -300,8 +300,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi8'
@@ -310,7 +310,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi8'
@@ -319,7 +319,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi8'
@@ -328,7 +328,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi8'
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-select-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-select-sizelatency.ll
index d01e011..0f8b32e 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-select-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-select-sizelatency.ll
@@ -20,30 +20,30 @@
 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512, <16 x double> %src1024, <2 x double> %src128_1, <4 x double> %src256_1, <8 x double> %src512_1, <16 x double> %src1024_1) {
 ; SSE-LABEL: 'test_vXf64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXf64'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXf64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
@@ -56,30 +56,30 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512, <16 x i64> %src1024, <2 x i64> %src128_1, <4 x i64> %src256_1, <8 x i64> %src512_1, <16 x i64> %src1024_1) {
 ; SSE-LABEL: 'test_vXi64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi64'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXi64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
@@ -93,41 +93,41 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE2-LABEL: 'test_vXf32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXf32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXf32'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXf32'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf32'
@@ -135,7 +135,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
@@ -150,41 +150,41 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-LABEL: 'test_vXi32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi32'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXi32'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi32'
@@ -192,7 +192,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
@@ -208,27 +208,27 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi16'
@@ -236,8 +236,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi16'
@@ -245,8 +245,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi16'
@@ -254,8 +254,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi16'
@@ -264,7 +264,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
@@ -282,8 +282,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi8'
@@ -291,8 +291,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi8'
@@ -300,8 +300,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi8'
@@ -310,7 +310,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi8'
@@ -319,7 +319,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi8'
@@ -328,7 +328,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi8'
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-select.ll b/llvm/test/Analysis/CostModel/X86/shuffle-select.ll
index 7b882e3..c483d5d 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-select.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-select.ll
@@ -20,30 +20,30 @@
 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512, <16 x double> %src1024, <2 x double> %src128_1, <4 x double> %src256_1, <8 x double> %src512_1, <16 x double> %src1024_1) {
 ; SSE-LABEL: 'test_vXf64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXf64'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXf64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
@@ -56,30 +56,30 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512, <16 x i64> %src1024, <2 x i64> %src128_1, <4 x i64> %src256_1, <8 x i64> %src512_1, <16 x i64> %src1024_1) {
 ; SSE-LABEL: 'test_vXi64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi64'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXi64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
@@ -93,41 +93,41 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE2-LABEL: 'test_vXf32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXf32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXf32'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXf32'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf32'
@@ -135,7 +135,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
@@ -150,41 +150,41 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-LABEL: 'test_vXi32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi32'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXi32'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi32'
@@ -192,7 +192,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
@@ -208,27 +208,27 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi16'
@@ -236,8 +236,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi16'
@@ -245,8 +245,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi16'
@@ -254,8 +254,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi16'
@@ -264,7 +264,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
@@ -282,8 +282,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi8'
@@ -291,8 +291,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi8'
@@ -300,8 +300,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi8'
@@ -310,7 +310,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi8'
@@ -319,7 +319,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi8'
@@ -328,7 +328,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi8'
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-single-src-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-single-src-latency.ll
index 3fa5458..330cbc0 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-single-src-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-single-src-latency.ll
@@ -47,7 +47,7 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -86,7 +86,7 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
@@ -127,7 +127,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -174,7 +174,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 13, i32 10, i32 9, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -336,7 +336,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512F-LABEL: 'test_vXi8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 7, i32 5, i32 5, i32 3, i32 3, i32 1, i32 1>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -345,7 +345,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BW-LABEL: 'test_vXi8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 7, i32 5, i32 5, i32 3, i32 3, i32 1, i32 1>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -354,7 +354,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMI-LABEL: 'test_vXi8'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
 ; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
 ; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 7, i32 5, i32 5, i32 3, i32 3, i32 1, i32 1>
 ; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll
index 39c935f..0215f65 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll
@@ -4,9 +4,9 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.2 | FileCheck %s -check-prefixes=SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx | FileCheck %s -check-prefixes=AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx2 | FileCheck %s -check-prefixes=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512
 ;
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=slm | FileCheck %s --check-prefixes=SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42
@@ -19,20 +19,20 @@
 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) {
 ; SSE-LABEL: 'test_vXf64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf64'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf64'
@@ -50,20 +50,20 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) {
 ; SSE-LABEL: 'test_vXi64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi64'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi64'
@@ -82,22 +82,22 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE-LABEL: 'test_vXf32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf32'
@@ -118,22 +118,22 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE-LABEL: 'test_vXi32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi32'
@@ -154,58 +154,42 @@ define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128
 ; SSE2-LABEL: 'test_vXf16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXf16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf16'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXf16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXf16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXf16'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX512-LABEL: 'test_vXf16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V32  = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
   %V64  = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -260,66 +244,50 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-LABEL: 'test_vXi16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi16'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi16'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXi16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXi16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXi16'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX512-LABEL: 'test_vXi16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V32  = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
   %V64  = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -332,11 +300,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
 ; SSE2-LABEL: 'test_vXi8'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi8'
@@ -344,8 +312,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi8'
@@ -353,8 +321,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi8'
@@ -362,8 +330,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi8'
@@ -371,36 +339,18 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXi8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXi8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXi8'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX512-LABEL: 'test_vXi8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V16  = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
   %V32  = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -415,10 +365,10 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSE2-LABEL: 'test_vXi1'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 960 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi1'
@@ -426,8 +376,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi1'
@@ -435,8 +385,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi1'
@@ -444,8 +394,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi1'
@@ -453,17 +403,17 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi1'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V2  = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
@@ -483,22 +433,22 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
 ; SSE-LABEL: 'test_upper_vXf32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_upper_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_upper_vXf32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_upper_vXf32'
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll
index 2a89924..b20986e 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll
@@ -4,9 +4,9 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.2 | FileCheck %s -check-prefixes=SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx | FileCheck %s -check-prefixes=AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx2 | FileCheck %s -check-prefixes=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512
 ;
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=slm | FileCheck %s --check-prefixes=SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42
@@ -19,26 +19,26 @@
 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) {
 ; SSE-LABEL: 'test_vXf64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf64'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
@@ -50,26 +50,26 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) {
 ; SSE-LABEL: 'test_vXi64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi64'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
@@ -82,29 +82,29 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE-LABEL: 'test_vXf32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
@@ -118,29 +118,29 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE-LABEL: 'test_vXi32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -154,58 +154,42 @@ define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128
 ; SSE2-LABEL: 'test_vXf16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXf16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf16'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXf16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXf16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXf16'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX512-LABEL: 'test_vXf16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V32  = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
   %V64  = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -260,66 +244,50 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-LABEL: 'test_vXi16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi16'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi16'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXi16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXi16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXi16'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX512-LABEL: 'test_vXi16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V32  = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
   %V64  = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -332,11 +300,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
 ; SSE2-LABEL: 'test_vXi8'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi8'
@@ -344,8 +312,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi8'
@@ -353,8 +321,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi8'
@@ -362,8 +330,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi8'
@@ -371,36 +339,18 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXi8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXi8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXi8'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX512-LABEL: 'test_vXi8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V16  = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
   %V32  = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -415,10 +365,10 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSE2-LABEL: 'test_vXi1'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 960 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi1'
@@ -426,8 +376,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi1'
@@ -435,8 +385,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi1'
@@ -444,8 +394,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi1'
@@ -453,17 +403,17 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi1'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V2  = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
@@ -483,29 +433,29 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
 ; SSE-LABEL: 'test_upper_vXf32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_upper_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_upper_vXf32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_upper_vXf32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll
index 848e7b4..56d8cad 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll
@@ -4,9 +4,9 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.2 | FileCheck %s -check-prefixes=SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx | FileCheck %s -check-prefixes=AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx2 | FileCheck %s -check-prefixes=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512
 ;
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=slm | FileCheck %s --check-prefixes=SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42
@@ -19,20 +19,20 @@
 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) {
 ; SSE-LABEL: 'test_vXf64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf64'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf64'
@@ -50,20 +50,20 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) {
 ; SSE-LABEL: 'test_vXi64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi64'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi64'
@@ -82,22 +82,22 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE-LABEL: 'test_vXf32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf32'
@@ -118,22 +118,22 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE-LABEL: 'test_vXi32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi32'
@@ -154,58 +154,42 @@ define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128
 ; SSE2-LABEL: 'test_vXf16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXf16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf16'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXf16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXf16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXf16'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX512-LABEL: 'test_vXf16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V32  = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
   %V64  = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -260,66 +244,50 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-LABEL: 'test_vXi16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi16'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi16'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXi16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXi16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXi16'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX512-LABEL: 'test_vXi16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V32  = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
   %V64  = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -332,11 +300,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
 ; SSE2-LABEL: 'test_vXi8'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi8'
@@ -344,8 +312,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi8'
@@ -353,8 +321,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi8'
@@ -362,8 +330,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi8'
@@ -371,36 +339,18 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXi8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXi8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXi8'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX512-LABEL: 'test_vXi8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V16  = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
   %V32  = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -415,10 +365,10 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSE2-LABEL: 'test_vXi1'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 960 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi1'
@@ -426,8 +376,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi1'
@@ -435,8 +385,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi1'
@@ -444,8 +394,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi1'
@@ -453,17 +403,17 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi1'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V2  = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
@@ -483,22 +433,22 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
 ; SSE-LABEL: 'test_upper_vXf32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_upper_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_upper_vXf32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_upper_vXf32'
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll
index 4c6d1ccd..56f56c3c 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll
@@ -4,9 +4,9 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s -check-prefixes=SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx | FileCheck %s -check-prefixes=AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s -check-prefixes=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512
 ;
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42
@@ -19,20 +19,20 @@
 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) {
 ; SSE-LABEL: 'test_vXf64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf64'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf64'
@@ -50,20 +50,20 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) {
 ; SSE-LABEL: 'test_vXi64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi64'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi64'
@@ -82,22 +82,22 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE-LABEL: 'test_vXf32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf32'
@@ -118,22 +118,22 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE-LABEL: 'test_vXi32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi32'
@@ -154,58 +154,42 @@ define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128
 ; SSE2-LABEL: 'test_vXf16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXf16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf16'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXf16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXf16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXf16'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX512-LABEL: 'test_vXf16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V32  = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
   %V64  = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -260,66 +244,50 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-LABEL: 'test_vXi16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi16'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi16'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXi16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXi16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXi16'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX512-LABEL: 'test_vXi16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V32  = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
   %V64  = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -332,11 +300,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
 ; SSE2-LABEL: 'test_vXi8'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi8'
@@ -344,8 +312,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi8'
@@ -353,8 +321,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi8'
@@ -362,8 +330,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi8'
@@ -371,36 +339,18 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXi8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXi8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXi8'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX512-LABEL: 'test_vXi8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V16  = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
   %V32  = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -415,10 +365,10 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSE2-LABEL: 'test_vXi1'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 960 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi1'
@@ -426,8 +376,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi1'
@@ -435,8 +385,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi1'
@@ -444,8 +394,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi1'
@@ -453,17 +403,17 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi1'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V2  = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
@@ -483,22 +433,22 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
 ; SSE-LABEL: 'test_upper_vXf32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_upper_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_upper_vXf32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_upper_vXf32'
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splice-codesize.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splice-codesize.ll
index f67d681..3183331 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splice-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splice-codesize.ll
@@ -71,7 +71,7 @@ define void @test_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf32'
@@ -120,21 +120,21 @@ define void @test_vXi32(<2 x i32> %a64, <2 x i32> %b64, <4 x i32> %a128, <4 x i3
 ; SSE2-LABEL: 'test_vXi32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splice-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splice-latency.ll
index 8b02b82..4783f23 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splice-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splice-latency.ll
@@ -71,7 +71,7 @@ define void @test_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf32'
@@ -120,21 +120,21 @@ define void @test_vXi32(<2 x i32> %a64, <2 x i32> %b64, <4 x i32> %a128, <4 x i3
 ; SSE2-LABEL: 'test_vXi32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splice-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splice-sizelatency.ll
index 65558da..86eabc7 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splice-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splice-sizelatency.ll
@@ -71,7 +71,7 @@ define void @test_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf32'
@@ -120,21 +120,21 @@ define void @test_vXi32(<2 x i32> %a64, <2 x i32> %b64, <4 x i32> %a128, <4 x i3
 ; SSE2-LABEL: 'test_vXi32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splice.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splice.ll
index b687df5..e0bf638 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splice.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splice.ll
@@ -71,7 +71,7 @@ define void @test_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf32'
@@ -120,21 +120,21 @@ define void @test_vXi32(<2 x i32> %a64, <2 x i32> %b64, <4 x i32> %a128, <4 x i3
 ; SSE2-LABEL: 'test_vXi32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-codesize.ll b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-codesize.ll
index 027af62..b672df5 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-codesize.ll
@@ -124,7 +124,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 3, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-latency.ll
index f9f045f..fc8c0cd 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-latency.ll
@@ -124,7 +124,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 3, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-sizelatency.ll
index 76690af..b48b620 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-sizelatency.ll
@@ -124,7 +124,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 3, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll b/llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll
index 034ec0a..efa0f2e 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll
@@ -124,7 +124,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 3, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/vector-insert-value.ll b/llvm/test/Analysis/CostModel/X86/vector-insert-value.ll
index 2524976..ee82e10 100644
--- a/llvm/test/Analysis/CostModel/X86/vector-insert-value.ll
+++ b/llvm/test/Analysis/CostModel/X86/vector-insert-value.ll
@@ -76,58 +76,58 @@ define i32 @insert_double(i32 %arg, double %val, <2 x double> %src128, <4 x doub
 define i32 @insert_float(i32 %arg, float %val, <2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
 ; SSE2-LABEL: 'insert_float'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'insert_float'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'insert_float'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
diff --git a/llvm/test/Analysis/Lint/abi-attrs.ll b/llvm/test/Analysis/Lint/abi-attrs.ll
new file mode 100644
index 0000000..5a3ece6
--- /dev/null
+++ b/llvm/test/Analysis/Lint/abi-attrs.ll
@@ -0,0 +1,106 @@
+; RUN: opt < %s -passes=lint -disable-output 2>&1 | FileCheck %s
+
+declare void @fn_nothing_i8(i8 %x)
+declare void @fn_zeroext(i8 zeroext %x)
+declare void @fn_signext(i8 signext %x)
+declare void @fn_inreg(i8 inreg %x)
+
+declare void @fn_nothing_ptr(ptr %x)
+declare void @fn_byval(ptr byval(i8) %x)
+declare void @fn_byref(ptr byref(i8) %x)
+declare void @fn_inalloca(ptr inalloca(i8) %x)
+declare void @fn_preallocated(ptr preallocated(i8) %x)
+declare void @fn_sret(ptr sret(i8) %x)
+
+define void @caller_zeroext(i8 %x) {
+; CHECK: Undefined behavior: ABI attribute zeroext not present on both function and call-site
+; CHECK:  call void @fn_zeroext(i8 %x)
+  call void @fn_zeroext(i8 %x)
+
+; CHECK: Undefined behavior: ABI attribute zeroext not present on both function and call-site
+; CHECK:  call void @fn_nothing_i8(i8 zeroext %x)
+  call void @fn_nothing_i8(i8 zeroext %x)
+  ret void
+}
+
+define void @caller_signext(i8 %x) {
+; CHECK: Undefined behavior: ABI attribute signext not present on both function and call-site
+; CHECK:  call void @fn_signext(i8 %x)
+  call void @fn_signext(i8 %x)
+
+; CHECK: Undefined behavior: ABI attribute signext not present on both function and call-site
+; CHECK:  call void @fn_nothing_i8(i8 signext %x)
+  call void @fn_nothing_i8(i8 signext %x)
+  ret void
+}
+
+define void @caller_inreg(i8 %x) {
+; CHECK: Undefined behavior: ABI attribute inreg not present on both function and call-site
+; CHECK:  call void @fn_inreg(i8 %x)
+  call void @fn_inreg(i8 %x)
+
+; CHECK: Undefined behavior: ABI attribute inreg not present on both function and call-site
+; CHECK:  call void @fn_nothing_i8(i8 inreg %x)
+  call void @fn_nothing_i8(i8 inreg %x)
+  ret void
+}
+
+define void @caller_byval(ptr %x) {
+; CHECK: Undefined behavior: ABI attribute byval not present on both function and call-site
+; CHECK:  call void @fn_byval(ptr %x)
+  call void @fn_byval(ptr %x)
+
+; CHECK: Undefined behavior: ABI attribute byval not present on both function and call-site
+; CHECK:  call void @fn_nothing_ptr(ptr byval(i8) %x)
+  call void @fn_nothing_ptr(ptr byval(i8) %x)
+
+; CHECK: Undefined behavior: ABI attribute byval does not have same argument for function and call-site
+; CHECK:  call void @fn_byval(ptr byval(i16) %x)
+  call void @fn_byval(ptr byval(i16) %x)
+  ret void
+}
+
+define void @caller_byref(ptr %x) {
+; CHECK: Undefined behavior: ABI attribute byref not present on both function and call-site
+; CHECK:  call void @fn_byref(ptr %x)
+  call void @fn_byref(ptr %x)
+
+; CHECK: Undefined behavior: ABI attribute byref not present on both function and call-site
+; CHECK:  call void @fn_nothing_ptr(ptr byref(i8) %x)
+  call void @fn_nothing_ptr(ptr byref(i8) %x)
+
+; CHECK: Undefined behavior: ABI attribute byref does not have same argument for function and call-site
+; CHECK:  call void @fn_byref(ptr byref(i16) %x)
+  call void @fn_byref(ptr byref(i16) %x)
+  ret void
+}
+
+define void @caller_inalloca(ptr %x) {
+; CHECK: Undefined behavior: ABI attribute inalloca not present on both function and call-site
+; CHECK:  call void @fn_inalloca(ptr %x)
+  call void @fn_inalloca(ptr %x)
+
+; CHECK: Undefined behavior: ABI attribute inalloca not present on both function and call-site
+; CHECK:  call void @fn_nothing_ptr(ptr inalloca(i8) %x)
+  call void @fn_nothing_ptr(ptr inalloca(i8) %x)
+
+; CHECK: Undefined behavior: ABI attribute inalloca does not have same argument for function and call-site
+; CHECK:  call void @fn_inalloca(ptr inalloca(i16) %x)
+  call void @fn_inalloca(ptr inalloca(i16) %x)
+  ret void
+}
+
+define void @caller_sret(ptr %x) {
+; CHECK: Undefined behavior: ABI attribute sret not present on both function and call-site
+; CHECK:  call void @fn_sret(ptr %x)
+  call void @fn_sret(ptr %x)
+
+; CHECK: Undefined behavior: ABI attribute sret not present on both function and call-site
+; CHECK:  call void @fn_nothing_ptr(ptr sret(i8) %x)
+  call void @fn_nothing_ptr(ptr sret(i8) %x)
+
+; CHECK: Undefined behavior: ABI attribute sret does not have same argument for function and call-site
+; CHECK:  call void @fn_sret(ptr sret(i16) %x)
+  call void @fn_sret(ptr sret(i16) %x)
+  ret void
+}
diff --git a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll
index 81fe96a..46dccf4 100644
--- a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll
+++ b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll
@@ -310,3 +310,29 @@ inner.header:
 exit:
   ret void
 }
+
+; Checks correct traversal for loops without a unique predecessor
+; outside the loop.
+define void @pr120615() {
+; CHECK-LABEL: pr120615
+; CHECK-NEXT:  Determining loop execution counts for: @pr120615
+; CHECK-NEXT:  Loop %header: backedge-taken count is i32 0
+; CHECK-NEXT:  Loop %header: constant max backedge-taken count is i32 0
+; CHECK-NEXT:  Loop %header: symbolic max backedge-taken count is i32 0
+; CHECK-NEXT:  Loop %header: Trip multiple is 1
+entry:
+  br label %header
+
+bb:
+  br label %header
+
+header:
+  %0 = phi i32 [ %1, %header ], [ 0, %bb ], [ 0, %entry ]
+  %1 = add i32 %0, 1
+  %icmp = icmp slt i32 %0, 0
+  br i1 %icmp, label %header, label %exit
+
+exit:
+  ret void
+
+}
diff --git a/llvm/test/Analysis/UniformityAnalysis/NVPTX/daorder.ll b/llvm/test/Analysis/UniformityAnalysis/NVPTX/daorder.ll
index 89d8c5a..14f33d7 100644
--- a/llvm/test/Analysis/UniformityAnalysis/NVPTX/daorder.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/NVPTX/daorder.ll
@@ -3,7 +3,7 @@
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
-define i32 @daorder(i32 %n) {
+define ptx_kernel i32 @daorder(i32 %n) {
 ; CHECK-LABEL: for function 'daorder'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
@@ -43,6 +43,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
 declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
-
-!nvvm.annotations = !{!0}
-!0 = !{ptr @daorder, !"kernel", i32 1}
diff --git a/llvm/test/Analysis/UniformityAnalysis/NVPTX/diverge.ll b/llvm/test/Analysis/UniformityAnalysis/NVPTX/diverge.ll
index 0ac1b5f5..cf8ffad 100644
--- a/llvm/test/Analysis/UniformityAnalysis/NVPTX/diverge.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/NVPTX/diverge.ll
@@ -4,7 +4,7 @@ target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
 ; return (n < 0 ? a + threadIdx.x : b + threadIdx.x)
-define i32 @no_diverge(i32 %n, i32 %a, i32 %b) {
+define ptx_kernel i32 @no_diverge(i32 %n, i32 %a, i32 %b) {
 ; CHECK-LABEL: for function 'no_diverge'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
@@ -27,7 +27,7 @@ merge:
 ; if (threadIdx.x < 5)    // divergent: data dependent
 ;   c = b;
 ; return c;               // c is divergent: sync dependent
-define i32 @sync(i32 %a, i32 %b) {
+define ptx_kernel i32 @sync(i32 %a, i32 %b) {
 ; CHECK-LABEL: for function 'sync'
 bb1:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
@@ -49,7 +49,7 @@ bb3:
 ; }
 ; // c here is divergent because it is sync dependent on threadIdx.x >= 5
 ; return c;
-define i32 @mixed(i32 %n, i32 %a, i32 %b) {
+define ptx_kernel i32 @mixed(i32 %n, i32 %a, i32 %b) {
 ; CHECK-LABEL: for function 'mixed'
 bb1:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
@@ -101,7 +101,7 @@ merge:
 ; return i == 10 ? 0 : 1; // i here is divergent
 ;
 ; The i defined in the loop is used outside.
-define i32 @loop() {
+define ptx_kernel i32 @loop() {
 ; CHECK-LABEL: for function 'loop'
 entry:
   %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
@@ -149,7 +149,7 @@ else:
 }
 
 ; Verifies sync-dependence is computed correctly in the absense of loops.
-define i32 @sync_no_loop(i32 %arg) {
+define ptx_kernel i32 @sync_no_loop(i32 %arg) {
 ; CHECK-LABEL: for function 'sync_no_loop'
 entry:
   %0 = add i32 %arg, 1
@@ -174,9 +174,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
 declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
 
-!nvvm.annotations = !{!0, !1, !2, !3, !4}
-!0 = !{ptr @no_diverge, !"kernel", i32 1}
-!1 = !{ptr @sync, !"kernel", i32 1}
-!2 = !{ptr @mixed, !"kernel", i32 1}
-!3 = !{ptr @loop, !"kernel", i32 1}
-!4 = !{ptr @sync_no_loop, !"kernel", i32 1}
diff --git a/llvm/test/Analysis/UniformityAnalysis/NVPTX/hidden_diverge.ll b/llvm/test/Analysis/UniformityAnalysis/NVPTX/hidden_diverge.ll
index e319211..65512bf 100644
--- a/llvm/test/Analysis/UniformityAnalysis/NVPTX/hidden_diverge.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/NVPTX/hidden_diverge.ll
@@ -3,7 +3,7 @@
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
-define i32 @hidden_diverge(i32 %n, i32 %a, i32 %b) {
+define ptx_kernel i32 @hidden_diverge(i32 %n, i32 %a, i32 %b) {
 ; CHECK-LABEL: for function 'hidden_diverge'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
@@ -27,6 +27,3 @@ merge:
 }
 
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-
-!nvvm.annotations = !{!0}
-!0 = !{ptr @hidden_diverge, !"kernel", i32 1}
diff --git a/llvm/test/Analysis/UniformityAnalysis/NVPTX/irreducible.ll b/llvm/test/Analysis/UniformityAnalysis/NVPTX/irreducible.ll
index cd729a9..e1ecc69 100644
--- a/llvm/test/Analysis/UniformityAnalysis/NVPTX/irreducible.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/NVPTX/irreducible.ll
@@ -23,7 +23,7 @@ target triple = "nvptx64-nvidia-cuda"
 ;                             V
 ;                        if (i3 == 5) // divergent
 ; because sync dependent on (tid / i3).
-define i32 @unstructured_loop(i1 %entry_cond) {
+define ptx_kernel i32 @unstructured_loop(i1 %entry_cond) {
 ; CHECK-LABEL: for function 'unstructured_loop'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
@@ -59,5 +59,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
 declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
 
-!nvvm.annotations = !{!0}
-!0 = !{ptr @unstructured_loop, !"kernel", i32 1}
diff --git a/llvm/test/Analysis/ValueTracking/knownbits-trunc-with-min-max-clamp.ll b/llvm/test/Analysis/ValueTracking/knownbits-trunc-with-min-max-clamp.ll
new file mode 100644
index 0000000..52f12a6
--- /dev/null
+++ b/llvm/test/Analysis/ValueTracking/knownbits-trunc-with-min-max-clamp.ll
@@ -0,0 +1,388 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=aggressive-instcombine -S | FileCheck %s
+
+; The LIT tests rely on i32, i16 and i8 being valid machine types.
+; The bounds checking tests require also i64 and i128.
+target datalayout = "n8:16:32:64:128"
+
+; This LIT test checks if TruncInstCombine pass correctly recognizes the
+; constraints from a signed min-max clamp. The clamp is a sequence of smin and
+; smax instructions limiting a variable into a range, smin <= x <= smax.
+;
+; Each LIT test (except the last ones) has two versions depending on the order
+; of smin and smax:
+; a) y = smax(smin(x, upper_limit), lower_limit)
+; b) y = smin(smax(x, lower_limit), upper_limit)
+;
+; The clamp is used in TruncInstCombine.cpp pass (as part of aggressive-instcombine)
+; to optimize extensions and truncations of lshr. This is what is tested here.
+; The pass also optimizes extensions and truncations of other binary operators,
+; but in such cases the smin-smax clamp may not be used.
+
+define i8 @test_0a(i16 %x) {
+; CHECK-LABEL: define i8 @test_0a(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 31)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 0)
+; CHECK-NEXT:    [[A:%.*]] = trunc i16 [[TMP2]] to i8
+; CHECK-NEXT:    [[B:%.*]] = lshr i8 [[A]], 2
+; CHECK-NEXT:    ret i8 [[B]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 31)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 0)
+  %a = sext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+define i8 @test_0b(i16 %x) {
+; CHECK-LABEL: define i8 @test_0b(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smax.i16(i16 [[X]], i16 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP1]], i16 31)
+; CHECK-NEXT:    [[A:%.*]] = trunc i16 [[TMP2]] to i8
+; CHECK-NEXT:    [[B:%.*]] = lshr i8 [[A]], 2
+; CHECK-NEXT:    ret i8 [[B]]
+;
+  %1 = tail call i16 @llvm.smax.i16(i16 %x, i16 0)
+  %2 = tail call i16 @llvm.smin.i16(i16 %1, i16 31)
+  %a = sext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; The following two tests contain add instead of lshr.
+; The optimization works here as well.
+define i8 @test_1a(i16 %x) {
+; CHECK-LABEL: define i8 @test_1a(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 31)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 0)
+; CHECK-NEXT:    [[A:%.*]] = trunc i16 [[TMP2]] to i8
+; CHECK-NEXT:    [[B:%.*]] = add i8 [[A]], 2
+; CHECK-NEXT:    ret i8 [[B]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 31)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 0)
+  %a = sext i16 %2 to i32
+  %b = add i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+define i8 @test_1b(i16 %x) {
+; CHECK-LABEL: define i8 @test_1b(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smax.i16(i16 [[X]], i16 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP1]], i16 31)
+; CHECK-NEXT:    [[A:%.*]] = trunc i16 [[TMP2]] to i8
+; CHECK-NEXT:    [[B:%.*]] = add i8 [[A]], 2
+; CHECK-NEXT:    ret i8 [[B]]
+;
+  %1 = tail call i16 @llvm.smax.i16(i16 %x, i16 0)
+  %2 = tail call i16 @llvm.smin.i16(i16 %1, i16 31)
+  %a = sext i16 %2 to i32
+  %b = add i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; Tests for clamping with negative min and max.
+
+; With sext no optimization occurs.
+define i8 @test_2a(i16 %x) {
+; CHECK-LABEL: define i8 @test_2a(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 -1)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 -31)
+; CHECK-NEXT:    [[A:%.*]] = sext i16 [[TMP2]] to i32
+; CHECK-NEXT:    [[B:%.*]] = lshr i32 [[A]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i32 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 -1)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 -31)
+  %a = sext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+define i8 @test_2b(i16 %x) {
+; CHECK-LABEL: define i8 @test_2b(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smax.i16(i16 [[X]], i16 -31)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP1]], i16 -1)
+; CHECK-NEXT:    [[A:%.*]] = sext i16 [[TMP2]] to i32
+; CHECK-NEXT:    [[B:%.*]] = lshr i32 [[A]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i32 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smax.i16(i16 %x, i16 -31)
+  %2 = tail call i16 @llvm.smin.i16(i16 %1, i16 -1)
+  %a = sext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; With zext the optimization occurs.
+define i8 @test_2c(i16 %x) {
+; CHECK-LABEL: define i8 @test_2c(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 -1)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 -31)
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 -1)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 -31)
+  %a = zext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+define i8 @test_2d(i16 %x) {
+; CHECK-LABEL: define i8 @test_2d(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smax.i16(i16 [[X]], i16 -31)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP1]], i16 -1)
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smax.i16(i16 %x, i16 -31)
+  %2 = tail call i16 @llvm.smin.i16(i16 %1, i16 -1)
+  %a = zext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; Tests for clamping with mixed-signed min and max.
+; With zext the optimization occurs.
+define i8 @test_3a(i16 %x) {
+; CHECK-LABEL: define i8 @test_3a(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 31)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 -31)
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 31)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 -31)
+  %a = zext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+define i8 @test_3b(i16 %x) {
+; CHECK-LABEL: define i8 @test_3b(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smax.i16(i16 [[X]], i16 -31)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP1]], i16 31)
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smax.i16(i16 %x, i16 -31)
+  %2 = tail call i16 @llvm.smin.i16(i16 %1, i16 31)
+  %a = zext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; Optimizations with vector types.
+define <16 x i8> @test_vec_1a(<16 x i16> %x) {
+; CHECK-LABEL: define <16 x i8> @test_vec_1a(
+; CHECK-SAME: <16 x i16> [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[X]], <16 x i16> splat (i16 127))
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[TMP1]], <16 x i16> zeroinitializer)
+; CHECK-NEXT:    [[A:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
+; CHECK-NEXT:    [[B:%.*]] = lshr <16 x i8> [[A]], splat (i8 2)
+; CHECK-NEXT:    ret <16 x i8> [[B]]
+;
+  %1 = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> %x, <16 x i16> splat (i16 127))
+  %2 = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %1, <16 x i16> zeroinitializer)
+  %a = sext <16 x i16> %2 to <16 x i32>
+  %b = lshr <16 x i32> %a, splat (i32 2)
+  %b.trunc = trunc <16 x i32> %b to <16 x i8>
+  ret <16 x i8> %b.trunc
+}
+
+define <16 x i8> @test_vec_1b(<16 x i16> %x) {
+; CHECK-LABEL: define <16 x i8> @test_vec_1b(
+; CHECK-SAME: <16 x i16> [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[X]], <16 x i16> zeroinitializer)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[TMP1]], <16 x i16> splat (i16 127))
+; CHECK-NEXT:    [[A:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
+; CHECK-NEXT:    [[B:%.*]] = lshr <16 x i8> [[A]], splat (i8 2)
+; CHECK-NEXT:    ret <16 x i8> [[B]]
+;
+  %1 = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %x, <16 x i16> zeroinitializer)
+  %2 = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> %1, <16 x i16> splat (i16 127))
+  %a = sext <16 x i16> %2 to <16 x i32>
+  %b = lshr <16 x i32> %a, splat (i32 2)
+  %b.trunc = trunc <16 x i32> %b to <16 x i8>
+  ret <16 x i8> %b.trunc
+}
+
+; A longer test that was the original motivation for the smin-smax clamping.
+define i8 @test_final(i16 %x, i16 %y) {
+; CHECK-LABEL: define i8 @test_final(
+; CHECK-SAME: i16 [[X:%.*]], i16 [[Y:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 127)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i16 @llvm.smax.i16(i16 [[Y]], i16 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP3]], i16 127)
+; CHECK-NEXT:    [[MUL:%.*]] = mul i16 [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i16 [[MUL]], 7
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i16 [[SHR]] to i8
+; CHECK-NEXT:    ret i8 [[TRUNC]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 127)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 0)
+  %x.clamp = zext nneg i16 %2 to i32
+  %3 = tail call i16 @llvm.smax.i16(i16 %y, i16 0)
+  %4 = tail call i16 @llvm.smin.i16(i16 %3, i16 127)
+  %y.clamp = zext nneg i16 %4 to i32
+  %mul = mul nuw nsw i32 %x.clamp, %y.clamp
+  %shr = lshr i32 %mul, 7
+  %trunc= trunc nuw nsw i32 %shr to i8
+  ret i8 %trunc
+}
+
+; Range tests below check if the bounds are dealt with correctly.
+
+; This gets optimized.
+define i8 @test_bounds_1(i16 %x) {
+; CHECK-LABEL: define i8 @test_bounds_1(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 127)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 0)
+; CHECK-NEXT:    [[A:%.*]] = trunc i16 [[TMP2]] to i8
+; CHECK-NEXT:    [[B:%.*]] = lshr i8 [[A]], 7
+; CHECK-NEXT:    ret i8 [[B]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 127)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 0)
+  %a = sext i16 %2 to i32
+  %b = lshr i32 %a, 7
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; While this does not.
+define i8 @test_bounds_2(i16 %x) {
+; CHECK-LABEL: define i8 @test_bounds_2(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 128)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 0)
+; CHECK-NEXT:    [[A:%.*]] = trunc i16 [[TMP2]] to i8
+; CHECK-NEXT:    [[B:%.*]] = lshr i8 [[A]], 7
+; CHECK-NEXT:    ret i8 [[B]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 128)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 0)
+  %a = sext i16 %2 to i32
+  %b = lshr i32 %a, 7
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; This should get optimized. We test here if the optimization works correctly
+; if the upper limit is signed max int.
+define i8 @test_bounds_3(i16 %x) {
+; CHECK-LABEL: define i8 @test_bounds_3(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 32767)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 32752)
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 32767)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 32752)
+  %a = sext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; Here min = 128 is greater than max = 0.
+define i8 @test_bounds_4(i16 %x) {
+; CHECK-LABEL: define i8 @test_bounds_4(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 128)
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 0)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 128)
+  %a = sext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; The following 3 tests check the situation where min and max are minimal and
+; maximal signed values. No transformations should occur here.
+define i8 @test_bounds_5(i16 %x) {
+; CHECK-LABEL: define i8 @test_bounds_5(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 32767)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 -32768)
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 32767)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 -32768)
+  %a = zext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+define i8 @test_bounds_6(i32 %x) {
+; CHECK-LABEL: define i8 @test_bounds_6(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.smin.i32(i32 [[X]], i32 2147483647)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i32 @llvm.smax.i32(i32 [[TMP1]], i32 -2147483648)
+; CHECK-NEXT:    [[B:%.*]] = lshr i32 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i32 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i32 @llvm.smin.i32(i32 %x, i32 2147483647)
+  %2 = tail call i32 @llvm.smax.i32(i32 %1, i32 -2147483648)
+  %a = zext i32 %2 to i64
+  %b = lshr i64 %a, 2
+  %b.trunc = trunc i64 %b to i8
+  ret i8 %b.trunc
+}
+
+define i8 @test_bounds_7(i64 %x) {
+; CHECK-LABEL: define i8 @test_bounds_7(
+; CHECK-SAME: i64 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.smin.i64(i64 [[X]], i64 9223372036854775807)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 -9223372036854775808)
+; CHECK-NEXT:    [[B:%.*]] = lshr i64 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i64 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i64 @llvm.smin.i64(i64 %x, i64 9223372036854775807)
+  %2 = tail call i64 @llvm.smax.i64(i64 %1, i64 -9223372036854775808)
+  %a = zext i64 %2 to i128
+  %b = lshr i128 %a, 2
+  %b.trunc = trunc i128 %b to i8
+  ret i8 %b.trunc
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
index 86fa12a..4afa0d4 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
@@ -436,6 +436,36 @@ body:             |
     $w0 = COPY %ext(s32)
 ...
 ---
+# select cond, 0, 64 --> (zext (!Cond)) << log2(Pow2)
+name:            select_cond_0_64_to_shift
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: select_cond_0_64_to_shift
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, [[C]]
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s8) = G_ZEXT [[XOR]](s1)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 6
+    ; CHECK-NEXT: %sel:_(s8) = G_SHL [[ZEXT]], [[C1]](s8)
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %two:_(s8) = G_CONSTANT i8 0
+    %one:_(s8) = G_CONSTANT i8 64
+    %sel:_(s8) = G_SELECT %c, %two, %one
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
+---
 # select cond, -1, 0 --> sext Cond
 name:            select_cond_minus_1_0_to_sext_cond
 body:             |
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir
index e3a633c..4d461c9 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir
@@ -48,3 +48,73 @@ body:             |
     G_BR %bb.2
 
 ...
+---
+name:            boolean_vector_to_scalar
+tracksRegLiveness: true
+body:             |
+    bb.1:
+    ; CHECK-LABEL: name: boolean_vector_to_scalar
+    ; CHECK: [[DEF:%[0-9]+]]:_(<8 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C]](s64)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC]](s8)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C1]]
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C3]](s64)
+    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC1]](s8)
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C1]]
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s64)
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[EVEC2:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C4]](s64)
+    ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC2]](s8)
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C1]]
+    ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C4]](s64)
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: [[EVEC3:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C5]](s64)
+    ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC3]](s8)
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C1]]
+    ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C5]](s64)
+    ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR2]], [[SHL3]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[EVEC4:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C6]](s64)
+    ; CHECK-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC4]](s8)
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT4]], [[C1]]
+    ; CHECK-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C6]](s64)
+    ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
+    ; CHECK-NEXT: [[EVEC5:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C7]](s64)
+    ; CHECK-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC5]](s8)
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT5]], [[C1]]
+    ; CHECK-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C7]](s64)
+    ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]]
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; CHECK-NEXT: [[EVEC6:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C8]](s64)
+    ; CHECK-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC6]](s8)
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT6]], [[C1]]
+    ; CHECK-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C8]](s64)
+    ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[OR5]], [[SHL6]]
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 7
+    ; CHECK-NEXT: [[EVEC7:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C9]](s64)
+    ; CHECK-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC7]](s8)
+    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT7]], [[C1]]
+    ; CHECK-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C9]](s64)
+    ; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]]
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[OR7]](s32)
+    ; CHECK-NEXT: G_STORE [[TRUNC]](s8), [[FRAME_INDEX]](p0) :: (store (s8) into %stack.0)
+    ; CHECK-NEXT: %bc:_(s8) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s8) from %stack.0)
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %bc(s8)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+      %vec:_(<8 x s1>) = G_IMPLICIT_DEF
+      %bc:_(s8) = G_BITCAST %vec(<8 x s1>)
+      %ext:_(s32) = G_ANYEXT %bc(s8)
+      $w0 = COPY %ext(s32)
+      RET_ReallyLR implicit $w0
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-1x.ll b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-1x.ll
new file mode 100644
index 0000000..b529577
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-1x.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple aarch64 -O0 -global-isel -o - %s | FileCheck %s
+
+define <1 x i1> @shuffle_extract_4(<8 x i1> %a, <8 x i1> %b) {
+; CHECK-LABEL: shuffle_extract_4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    umov w8, v0.h[4]
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %extractvec60 = shufflevector <8 x i1> %a, <8 x i1> %b, <1 x i32> <i32 4>
+  ret <1 x i1> %extractvec60
+}
+
+define <1 x i1> @shuffle_extract_12(<8 x i1> %a, <8 x i1> %b) {
+; CHECK-LABEL: shuffle_extract_12:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.8h, v1.8b, #0
+; CHECK-NEXT:    umov w8, v0.h[4]
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %extractvec60 = shufflevector <8 x i1> %a, <8 x i1> %b, <1 x i32> <i32 12>
+  ret <1 x i1> %extractvec60
+}
+
+define <1 x i1> @shuffle_extract_p(<8 x i1> %a, <8 x i1> %b) {
+; CHECK-LABEL: shuffle_extract_p:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // implicit-def: $w8
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %extractvec60 = shufflevector <8 x i1> %a, <8 x i1> %b, <1 x i32> <i32 poison>
+  ret <1 x i1> %extractvec60
+}
+
+define <1 x i32> @shufflevector_v1i32(<1 x i32> %a, <1 x i32> %b) {
+; CHECK-LABEL: shufflevector_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d0, d1
+; CHECK-NEXT:    ret
+    %c = shufflevector <1 x i32> %a, <1 x i32> %b, <1 x i32> <i32 1>
+    ret <1 x i32> %c
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
index 2464026..af03a21 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
@@ -618,3 +618,32 @@ body:             |
     RET_ReallyLR implicit $q0
 
 ...
+---
+name:            shuffle_v8i1_v1i8
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $d0, $d1
+    ; CHECK-LABEL: name: shuffle_v8i1_v1i8
+    ; CHECK: liveins: $d0, $d1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[COPY]](<8 x s8>)
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[ANYEXT]](<8 x s16>), [[C]](s64)
+    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC]](s16)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C1]]
+    ; CHECK-NEXT: $w0 = COPY [[AND]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %2:_(<8 x s8>) = COPY $d0
+    %0:_(<8 x s1>) = G_TRUNC %2:_(<8 x s8>)
+    %3:_(<8 x s8>) = COPY $d1
+    %1:_(<8 x s1>) = G_TRUNC %3:_(<8 x s8>)
+    %4:_(s1) = G_SHUFFLE_VECTOR %0:_(<8 x s1>), %1:_, shufflemask(12)
+    %5:_(s8) = G_ZEXT %4:_(s1)
+    %6:_(s32) = G_ANYEXT %5:_(s8)
+    $w0 = COPY %6:_(s32)
+    RET_ReallyLR implicit $w0
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir
new file mode 100644
index 0000000..1df6297
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir
@@ -0,0 +1,86 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -O0 -mtriple=aarch64 -run-pass=legalizer -global-isel-abort=1 %s -o - | FileCheck %s
+---
+name:            store_8xs1
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $q0, $q1, $x0
+    ; CHECK-LABEL: name: store_8xs1
+    ; CHECK: liveins: $q0, $q1, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
+    ; CHECK-NEXT: %ptr:_(p0) = COPY $x0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(slt), [[COPY]](<4 x s32>), [[BUILD_VECTOR]]
+    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(slt), [[COPY1]](<4 x s32>), [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP]](<4 x s32>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP1]](<4 x s32>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s16>), [[TRUNC1]](<4 x s16>)
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<8 x s16>)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C1]](s64)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC]](s8)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C1]](s64)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C3]](s64)
+    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC1]](s8)
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s64)
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[EVEC2:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C4]](s64)
+    ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC2]](s8)
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C2]]
+    ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C4]](s64)
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: [[EVEC3:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C5]](s64)
+    ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC3]](s8)
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C2]]
+    ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C5]](s64)
+    ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR2]], [[SHL3]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[EVEC4:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C6]](s64)
+    ; CHECK-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC4]](s8)
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT4]], [[C2]]
+    ; CHECK-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C6]](s64)
+    ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
+    ; CHECK-NEXT: [[EVEC5:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C7]](s64)
+    ; CHECK-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC5]](s8)
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT5]], [[C2]]
+    ; CHECK-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C7]](s64)
+    ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]]
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; CHECK-NEXT: [[EVEC6:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C8]](s64)
+    ; CHECK-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC6]](s8)
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT6]], [[C2]]
+    ; CHECK-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C8]](s64)
+    ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[OR5]], [[SHL6]]
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 7
+    ; CHECK-NEXT: [[EVEC7:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C9]](s64)
+    ; CHECK-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC7]](s8)
+    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT7]], [[C2]]
+    ; CHECK-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C9]](s64)
+    ; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]]
+    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[OR7]](s32)
+    ; CHECK-NEXT: G_STORE [[TRUNC3]](s8), %ptr(p0) :: (store (s8))
+    ; CHECK-NEXT: RET_ReallyLR
+    %1:_(<4 x s32>) = COPY $q0
+    %2:_(<4 x s32>) = COPY $q1
+    %ptr:_(p0) = COPY $x0
+    %0:_(<8 x s32>) = G_CONCAT_VECTORS %1(<4 x s32>), %2(<4 x s32>)
+    %4:_(s32) = G_CONSTANT i32 0
+    %3:_(<8 x s32>) = G_BUILD_VECTOR %4(s32), %4(s32), %4(s32), %4(s32), %4(s32), %4(s32), %4(s32), %4(s32)
+    %5:_(<8 x s1>) = G_ICMP intpred(slt), %0(<8 x s32>), %3
+    G_STORE %5(<8 x s1>), %ptr(p0) :: (store (<8 x s1>))
+    RET_ReallyLR
+...
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
index ee9fff7..f0c9dcc 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
@@ -440,11 +440,10 @@ define <8 x i16> @shufsext_v8i8_v8i16(<8 x i8> %src, <8 x i8> %b) {
 ;
 ; CHECK-GI-LABEL: shufsext_v8i8_v8i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI14_0
-; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI14_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
+; CHECK-GI-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-GI-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -493,11 +492,10 @@ define <8 x i16> @shufzext_v8i8_v8i16(<8 x i8> %src, <8 x i8> %b) {
 ;
 ; CHECK-GI-LABEL: shufzext_v8i8_v8i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI16_0
-; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI16_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
+; CHECK-GI-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-GI-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll
index 1aa28f5..9a1203f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll
@@ -156,11 +156,10 @@ define i32 @fptosi_bf(bfloat %a) nounwind ssp {
 ; CHECK-LABEL: fptosi_bf:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmov s1, s0
-; CHECK-NEXT:    // implicit-def: $s0
+; CHECK-NEXT:    // implicit-def: $d0
 ; CHECK-NEXT:    fmov s0, s1
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    fcvtzs w0, s0
 ; CHECK-NEXT:    ret
 entry:
@@ -173,11 +172,10 @@ define i32 @fptoui_sbf(bfloat %a) nounwind ssp {
 ; CHECK-LABEL: fptoui_sbf:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmov s1, s0
-; CHECK-NEXT:    // implicit-def: $s0
+; CHECK-NEXT:    // implicit-def: $d0
 ; CHECK-NEXT:    fmov s0, s1
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    fcvtzu w0, s0
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
index ed9c1b0..fb40dfc 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
@@ -182,17 +182,14 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 {
 define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fadd s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -202,36 +199,34 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB2_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fadd s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB2_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2:
@@ -281,17 +276,14 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fadd s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -301,36 +293,34 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB3_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fadd s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB3_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4:
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
index 888b795..818dcf3 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
@@ -184,17 +184,14 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 {
 define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fmaxnm s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -204,36 +201,34 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB2_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fmaxnm s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB2_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2:
@@ -283,17 +278,14 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fmaxnm s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -303,36 +295,34 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB3_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fmaxnm s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB3_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4:
@@ -653,31 +643,23 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; NOLSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
 ; NOLSE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NOLSE-NEXT:    mov h1, v0.h[1]
-; NOLSE-NEXT:    fmov w10, s0
+; NOLSE-NEXT:    dup v1.4h, v0.h[1]
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w10, w10, #16
-; NOLSE-NEXT:    fmov w9, s1
-; NOLSE-NEXT:    fmov s1, w10
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s0, w9
+; NOLSE-NEXT:    shll v0.4s, v0.4h, #16
+; NOLSE-NEXT:    shll v1.4s, v1.4h, #16
 ; NOLSE-NEXT:  .LBB7_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxr w9, [x0]
 ; NOLSE-NEXT:    fmov s2, w9
-; NOLSE-NEXT:    mov h3, v2.h[1]
-; NOLSE-NEXT:    fmov w11, s2
-; NOLSE-NEXT:    lsl w11, w11, #16
-; NOLSE-NEXT:    fmov w10, s3
-; NOLSE-NEXT:    fmov s3, w11
-; NOLSE-NEXT:    lsl w10, w10, #16
-; NOLSE-NEXT:    fmaxnm s3, s3, s1
-; NOLSE-NEXT:    fmov s2, w10
+; NOLSE-NEXT:    dup v3.4h, v2.h[1]
+; NOLSE-NEXT:    shll v2.4s, v2.4h, #16
 ; NOLSE-NEXT:    fmaxnm s2, s2, s0
-; NOLSE-NEXT:    fmov w11, s3
+; NOLSE-NEXT:    shll v3.4s, v3.4h, #16
+; NOLSE-NEXT:    fmaxnm s3, s3, s1
+; NOLSE-NEXT:    fmov w11, s2
 ; NOLSE-NEXT:    ubfx w13, w11, #16, #1
 ; NOLSE-NEXT:    add w11, w11, w8
-; NOLSE-NEXT:    fmov w10, s2
+; NOLSE-NEXT:    fmov w10, s3
 ; NOLSE-NEXT:    add w11, w13, w11
 ; NOLSE-NEXT:    lsr w11, w11, #16
 ; NOLSE-NEXT:    ubfx w12, w10, #16, #1
@@ -697,25 +679,17 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; LSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4:
 ; LSE:       // %bb.0:
 ; LSE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; LSE-NEXT:    mov h1, v0.h[1]
-; LSE-NEXT:    fmov w10, s0
+; LSE-NEXT:    dup v1.4h, v0.h[1]
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr s0, [x0]
-; LSE-NEXT:    lsl w10, w10, #16
-; LSE-NEXT:    fmov w9, s1
-; LSE-NEXT:    fmov s2, w10
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
+; LSE-NEXT:    shll v1.4s, v1.4h, #16
 ; LSE-NEXT:  .LBB7_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    mov h3, v0.h[1]
-; LSE-NEXT:    fmov w10, s0
-; LSE-NEXT:    lsl w10, w10, #16
-; LSE-NEXT:    fmov w9, s3
-; LSE-NEXT:    fmov s4, w10
-; LSE-NEXT:    lsl w9, w9, #16
+; LSE-NEXT:    dup v3.4h, v0.h[1]
+; LSE-NEXT:    shll v4.4s, v0.4h, #16
 ; LSE-NEXT:    fmaxnm s4, s4, s2
-; LSE-NEXT:    fmov s3, w9
+; LSE-NEXT:    shll v3.4s, v3.4h, #16
 ; LSE-NEXT:    fmaxnm s3, s3, s1
 ; LSE-NEXT:    fmov w10, s4
 ; LSE-NEXT:    ubfx w12, w10, #16, #1
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
index a3665c6..b969241e 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
@@ -184,17 +184,14 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 {
 define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fminnm s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -204,36 +201,34 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB2_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fminnm s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB2_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2:
@@ -283,17 +278,14 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fminnm s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -303,36 +295,34 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB3_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fminnm s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB3_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4:
@@ -653,31 +643,23 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; NOLSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
 ; NOLSE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NOLSE-NEXT:    mov h1, v0.h[1]
-; NOLSE-NEXT:    fmov w10, s0
+; NOLSE-NEXT:    dup v1.4h, v0.h[1]
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w10, w10, #16
-; NOLSE-NEXT:    fmov w9, s1
-; NOLSE-NEXT:    fmov s1, w10
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s0, w9
+; NOLSE-NEXT:    shll v0.4s, v0.4h, #16
+; NOLSE-NEXT:    shll v1.4s, v1.4h, #16
 ; NOLSE-NEXT:  .LBB7_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxr w9, [x0]
 ; NOLSE-NEXT:    fmov s2, w9
-; NOLSE-NEXT:    mov h3, v2.h[1]
-; NOLSE-NEXT:    fmov w11, s2
-; NOLSE-NEXT:    lsl w11, w11, #16
-; NOLSE-NEXT:    fmov w10, s3
-; NOLSE-NEXT:    fmov s3, w11
-; NOLSE-NEXT:    lsl w10, w10, #16
-; NOLSE-NEXT:    fminnm s3, s3, s1
-; NOLSE-NEXT:    fmov s2, w10
+; NOLSE-NEXT:    dup v3.4h, v2.h[1]
+; NOLSE-NEXT:    shll v2.4s, v2.4h, #16
 ; NOLSE-NEXT:    fminnm s2, s2, s0
-; NOLSE-NEXT:    fmov w11, s3
+; NOLSE-NEXT:    shll v3.4s, v3.4h, #16
+; NOLSE-NEXT:    fminnm s3, s3, s1
+; NOLSE-NEXT:    fmov w11, s2
 ; NOLSE-NEXT:    ubfx w13, w11, #16, #1
 ; NOLSE-NEXT:    add w11, w11, w8
-; NOLSE-NEXT:    fmov w10, s2
+; NOLSE-NEXT:    fmov w10, s3
 ; NOLSE-NEXT:    add w11, w13, w11
 ; NOLSE-NEXT:    lsr w11, w11, #16
 ; NOLSE-NEXT:    ubfx w12, w10, #16, #1
@@ -697,25 +679,17 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; LSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4:
 ; LSE:       // %bb.0:
 ; LSE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; LSE-NEXT:    mov h1, v0.h[1]
-; LSE-NEXT:    fmov w10, s0
+; LSE-NEXT:    dup v1.4h, v0.h[1]
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr s0, [x0]
-; LSE-NEXT:    lsl w10, w10, #16
-; LSE-NEXT:    fmov w9, s1
-; LSE-NEXT:    fmov s2, w10
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
+; LSE-NEXT:    shll v1.4s, v1.4h, #16
 ; LSE-NEXT:  .LBB7_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    mov h3, v0.h[1]
-; LSE-NEXT:    fmov w10, s0
-; LSE-NEXT:    lsl w10, w10, #16
-; LSE-NEXT:    fmov w9, s3
-; LSE-NEXT:    fmov s4, w10
-; LSE-NEXT:    lsl w9, w9, #16
+; LSE-NEXT:    dup v3.4h, v0.h[1]
+; LSE-NEXT:    shll v4.4s, v0.4h, #16
 ; LSE-NEXT:    fminnm s4, s4, s2
-; LSE-NEXT:    fmov s3, w9
+; LSE-NEXT:    shll v3.4s, v3.4h, #16
 ; LSE-NEXT:    fminnm s3, s3, s1
 ; LSE-NEXT:    fmov w10, s4
 ; LSE-NEXT:    ubfx w12, w10, #16, #1
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll
index 7725ce0..e603337 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll
@@ -182,17 +182,14 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 {
 define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fsub s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -202,36 +199,34 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB2_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fsub s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB2_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2:
@@ -281,17 +276,14 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fsub s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -301,36 +293,34 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB3_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fsub s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB3_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4:
diff --git a/llvm/test/CodeGen/AArch64/bf16-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-instructions.ll
index 3399761..bc06453 100644
--- a/llvm/test/CodeGen/AArch64/bf16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-instructions.ll
@@ -5,16 +5,12 @@
 define bfloat @test_fadd(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_fadd:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    fadd s0, s1, s0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fadd s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w9, w8
@@ -26,15 +22,11 @@ define bfloat @test_fadd(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fadd:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fadd s0, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fadd s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = fadd bfloat %a, %b
@@ -44,16 +36,12 @@ define bfloat @test_fadd(bfloat %a, bfloat %b) #0 {
 define bfloat @test_fsub(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_fsub:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    fsub s0, s1, s0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fsub s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w9, w8
@@ -65,15 +53,11 @@ define bfloat @test_fsub(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fsub:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fsub s0, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fsub s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = fsub bfloat %a, %b
@@ -83,16 +67,12 @@ define bfloat @test_fsub(bfloat %a, bfloat %b) #0 {
 define bfloat @test_fmul(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_fmul:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    fmul s0, s1, s0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fmul s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w9, w8
@@ -104,15 +84,11 @@ define bfloat @test_fmul(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fmul:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fmul s0, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmul s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = fmul bfloat %a, %b
@@ -122,27 +98,21 @@ define bfloat @test_fmul(bfloat %a, bfloat %b) #0 {
 define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; CHECK-CVT-LABEL: test_fmadd:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s1
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w10, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    fmul s0, s1, s0
+; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fmul s0, s0, s1
+; CHECK-CVT-NEXT:    shll v1.4s, v2.4h, #16
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    ubfx w9, w8, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w8, w10
 ; CHECK-CVT-NEXT:    add w8, w9, w8
-; CHECK-CVT-NEXT:    fmov w9, s2
 ; CHECK-CVT-NEXT:    lsr w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
 ; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    fadd s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    ubfx w9, w8, #16, #1
@@ -155,23 +125,15 @@ define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fmadd:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fmov w9, s2
-; CHECK-BF16-NEXT:    fmul s0, s1, s0
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmul s0, s0, s1
+; CHECK-BF16-NEXT:    shll v1.4s, v2.4h, #16
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    fadd s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -183,16 +145,12 @@ define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 {
 define bfloat @test_fdiv(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_fdiv:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    fdiv s0, s1, s0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fdiv s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w9, w8
@@ -204,15 +162,11 @@ define bfloat @test_fdiv(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fdiv:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fdiv s0, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fdiv s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = fdiv bfloat %a, %b
@@ -223,14 +177,12 @@ define bfloat @test_frem(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_frem:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-CVT-NEXT:    // kill: def $s1 killed $s1 killed $q1
 ; CHECK-CVT-NEXT:    bl fmodf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -246,14 +198,12 @@ define bfloat @test_frem(bfloat %a, bfloat %b) #0 {
 ; CHECK-BF16-LABEL: test_frem:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    fmov w9, s1
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    // kill: def $s1 killed $s1 killed $q1
 ; CHECK-BF16-NEXT:    bl fmodf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -334,17 +284,13 @@ define bfloat @test_select(bfloat %a, bfloat %b, i1 zeroext %c) #0 {
 define bfloat @test_select_cc(bfloat %a, bfloat %b, bfloat %c, bfloat %d) #0 {
 ; CHECK-LABEL: test_select_cc:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h3 killed $h3 def $s3
-; CHECK-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    // kill: def $h3 killed $h3 def $d3
+; CHECK-NEXT:    // kill: def $h2 killed $h2 def $d2
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
 ; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    fcmp s2, s3
 ; CHECK-NEXT:    fcsel s0, s0, s1, ne
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; CHECK-NEXT:    ret
@@ -356,15 +302,11 @@ define bfloat @test_select_cc(bfloat %a, bfloat %b, bfloat %c, bfloat %d) #0 {
 define float @test_select_cc_f32_f16(float %a, float %b, bfloat %c, bfloat %d) #0 {
 ; CHECK-LABEL: test_select_cc_f32_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h3 killed $h3 def $s3
-; CHECK-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    // kill: def $h3 killed $h3 def $d3
+; CHECK-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    fcmp s2, s3
 ; CHECK-NEXT:    fcsel s0, s0, s1, ne
 ; CHECK-NEXT:    ret
   %cc = fcmp une bfloat %c, %d
@@ -389,15 +331,11 @@ define bfloat @test_select_cc_f16_f32(bfloat %a, bfloat %b, float %c, float %d)
 define i1 @test_fcmp_une(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_une:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %r = fcmp une bfloat %a, %b
@@ -407,15 +345,11 @@ define i1 @test_fcmp_une(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ueq(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ueq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w8, eq
 ; CHECK-NEXT:    csinc w0, w8, wzr, vc
 ; CHECK-NEXT:    ret
@@ -426,15 +360,11 @@ define i1 @test_fcmp_ueq(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ugt(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ugt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, hi
 ; CHECK-NEXT:    ret
   %r = fcmp ugt bfloat %a, %b
@@ -444,15 +374,11 @@ define i1 @test_fcmp_ugt(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_uge(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_uge:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, pl
 ; CHECK-NEXT:    ret
   %r = fcmp uge bfloat %a, %b
@@ -462,15 +388,11 @@ define i1 @test_fcmp_uge(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ult(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ult:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, lt
 ; CHECK-NEXT:    ret
   %r = fcmp ult bfloat %a, %b
@@ -480,15 +402,11 @@ define i1 @test_fcmp_ult(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ule(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ule:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, le
 ; CHECK-NEXT:    ret
   %r = fcmp ule bfloat %a, %b
@@ -498,15 +416,11 @@ define i1 @test_fcmp_ule(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_uno(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_uno:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, vs
 ; CHECK-NEXT:    ret
   %r = fcmp uno bfloat %a, %b
@@ -516,15 +430,11 @@ define i1 @test_fcmp_uno(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_one(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w8, mi
 ; CHECK-NEXT:    csinc w0, w8, wzr, le
 ; CHECK-NEXT:    ret
@@ -535,15 +445,11 @@ define i1 @test_fcmp_one(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_oeq(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_oeq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %r = fcmp oeq bfloat %a, %b
@@ -553,15 +459,11 @@ define i1 @test_fcmp_oeq(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ogt(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ogt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
   %r = fcmp ogt bfloat %a, %b
@@ -571,15 +473,11 @@ define i1 @test_fcmp_ogt(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_oge(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_oge:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, ge
 ; CHECK-NEXT:    ret
   %r = fcmp oge bfloat %a, %b
@@ -589,15 +487,11 @@ define i1 @test_fcmp_oge(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_olt(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_olt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, mi
 ; CHECK-NEXT:    ret
   %r = fcmp olt bfloat %a, %b
@@ -607,15 +501,11 @@ define i1 @test_fcmp_olt(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ole(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ole:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, ls
 ; CHECK-NEXT:    ret
   %r = fcmp ole bfloat %a, %b
@@ -625,15 +515,11 @@ define i1 @test_fcmp_ole(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ord(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ord:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, vc
 ; CHECK-NEXT:    ret
   %r = fcmp ord bfloat %a, %b
@@ -643,13 +529,11 @@ define i1 @test_fcmp_ord(bfloat %a, bfloat %b) #0 {
 define void @test_fccmp(bfloat %in, ptr %out) {
 ; CHECK-LABEL: test_fccmp:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    movi v1.2s, #69, lsl #24
-; CHECK-NEXT:    movi v3.2s, #72, lsl #24
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v2.4s, v0.4h, #16
 ; CHECK-NEXT:    adrp x8, .LCPI29_0
+; CHECK-NEXT:    movi v3.2s, #72, lsl #24
 ; CHECK-NEXT:    fcmp s2, s1
 ; CHECK-NEXT:    ldr h1, [x8, :lo12:.LCPI29_0]
 ; CHECK-NEXT:    fccmp s2, s3, #4, mi
@@ -667,15 +551,11 @@ define void @test_fccmp(bfloat %in, ptr %out) {
 define void @test_br_cc(bfloat %a, bfloat %b, ptr %p1, ptr %p2) #0 {
 ; CHECK-LABEL: test_br_cc:
 ; CHECK:       // %bb.0: // %common.ret
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    csel x8, x0, x1, pl
 ; CHECK-NEXT:    str wzr, [x8]
 ; CHECK-NEXT:    ret
@@ -725,10 +605,8 @@ declare i1 @test_dummy(ptr %p1) #0
 define i32 @test_fptosi_i32(bfloat %a) #0 {
 ; CHECK-LABEL: test_fptosi_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs w0, s0
 ; CHECK-NEXT:    ret
   %r = fptosi bfloat %a to i32
@@ -738,10 +616,8 @@ define i32 @test_fptosi_i32(bfloat %a) #0 {
 define i64 @test_fptosi_i64(bfloat %a) #0 {
 ; CHECK-LABEL: test_fptosi_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs x0, s0
 ; CHECK-NEXT:    ret
   %r = fptosi bfloat %a to i64
@@ -751,10 +627,8 @@ define i64 @test_fptosi_i64(bfloat %a) #0 {
 define i32 @test_fptoui_i32(bfloat %a) #0 {
 ; CHECK-LABEL: test_fptoui_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzu w0, s0
 ; CHECK-NEXT:    ret
   %r = fptoui bfloat %a to i32
@@ -764,10 +638,8 @@ define i32 @test_fptoui_i32(bfloat %a) #0 {
 define i64 @test_fptoui_i64(bfloat %a) #0 {
 ; CHECK-LABEL: test_fptoui_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzu x0, s0
 ; CHECK-NEXT:    ret
   %r = fptoui bfloat %a to i64
@@ -927,7 +799,8 @@ define bfloat @test_uitofp_i32_fadd(i32 %a, bfloat %b) #0 {
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    ucvtf d1, w0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    fcvtxn s1, d1
 ; CHECK-CVT-NEXT:    fmov w9, s1
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -935,12 +808,7 @@ define bfloat @test_uitofp_i32_fadd(i32 %a, bfloat %b) #0 {
 ; CHECK-CVT-NEXT:    add w9, w10, w9
 ; CHECK-CVT-NEXT:    lsr w9, w9, #16
 ; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    fmov w9, s0
-; CHECK-CVT-NEXT:    fmov w10, s1
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s1, w10
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-CVT-NEXT:    fadd s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -954,15 +822,11 @@ define bfloat @test_uitofp_i32_fadd(i32 %a, bfloat %b) #0 {
 ; CHECK-BF16-LABEL: test_uitofp_i32_fadd:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    ucvtf d1, w0
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    fcvtxn s1, d1
-; CHECK-BF16-NEXT:    fmov s0, w8
 ; CHECK-BF16-NEXT:    bfcvt h1, s1
-; CHECK-BF16-NEXT:    fmov w9, s1
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-BF16-NEXT:    fadd s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -976,7 +840,8 @@ define bfloat @test_sitofp_i32_fadd(i32 %a, bfloat %b) #0 {
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    scvtf d1, w0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    fcvtxn s1, d1
 ; CHECK-CVT-NEXT:    fmov w9, s1
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -984,12 +849,7 @@ define bfloat @test_sitofp_i32_fadd(i32 %a, bfloat %b) #0 {
 ; CHECK-CVT-NEXT:    add w9, w10, w9
 ; CHECK-CVT-NEXT:    lsr w9, w9, #16
 ; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    fmov w9, s0
-; CHECK-CVT-NEXT:    fmov w10, s1
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s1, w10
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-CVT-NEXT:    fadd s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -1003,15 +863,11 @@ define bfloat @test_sitofp_i32_fadd(i32 %a, bfloat %b) #0 {
 ; CHECK-BF16-LABEL: test_sitofp_i32_fadd:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    scvtf d1, w0
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    fcvtxn s1, d1
-; CHECK-BF16-NEXT:    fmov s0, w8
 ; CHECK-BF16-NEXT:    bfcvt h1, s1
-; CHECK-BF16-NEXT:    fmov w9, s1
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-BF16-NEXT:    fadd s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -1070,10 +926,9 @@ define bfloat @test_fptrunc_double(double %a) #0 {
 define float @test_fpext_float(bfloat %a) #0 {
 ; CHECK-LABEL: test_fpext_float:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    ret
   %r = fpext bfloat %a to float
   ret float %r
@@ -1082,10 +937,8 @@ define float @test_fpext_float(bfloat %a) #0 {
 define double @test_fpext_double(bfloat %a) #0 {
 ; CHECK-LABEL: test_fpext_double:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvt d0, s0
 ; CHECK-NEXT:    ret
   %r = fpext bfloat %a to double
@@ -1148,11 +1001,9 @@ declare bfloat @llvm.fmuladd.f16(bfloat %a, bfloat %b, bfloat %c) #0
 define bfloat @test_sqrt(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_sqrt:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    fsqrt s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -1165,10 +1016,8 @@ define bfloat @test_sqrt(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_sqrt:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    fsqrt s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -1180,10 +1029,9 @@ define bfloat @test_powi(bfloat %a, i32 %b) #0 {
 ; CHECK-CVT-LABEL: test_powi:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl __powisf2
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1199,10 +1047,9 @@ define bfloat @test_powi(bfloat %a, i32 %b) #0 {
 ; CHECK-BF16-LABEL: test_powi:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl __powisf2
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1216,10 +1063,9 @@ define bfloat @test_sin(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_sin:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl sinf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1235,10 +1081,9 @@ define bfloat @test_sin(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_sin:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl sinf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1251,10 +1096,9 @@ define bfloat @test_cos(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_cos:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl cosf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1270,10 +1114,9 @@ define bfloat @test_cos(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_cos:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl cosf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1286,10 +1129,9 @@ define bfloat @test_tan(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_tan:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl tanf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1305,10 +1147,9 @@ define bfloat @test_tan(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_tan:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl tanf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1321,10 +1162,9 @@ define bfloat @test_acos(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_acos:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl acosf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1340,10 +1180,9 @@ define bfloat @test_acos(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_acos:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl acosf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1356,10 +1195,9 @@ define bfloat @test_asin(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_asin:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl asinf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1375,10 +1213,9 @@ define bfloat @test_asin(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_asin:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl asinf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1391,10 +1228,9 @@ define bfloat @test_atan(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_atan:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl atanf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1410,10 +1246,9 @@ define bfloat @test_atan(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_atan:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl atanf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1426,14 +1261,12 @@ define bfloat @test_atan2(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_atan2:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-CVT-NEXT:    // kill: def $s1 killed $s1 killed $q1
 ; CHECK-CVT-NEXT:    bl atan2f
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1449,14 +1282,12 @@ define bfloat @test_atan2(bfloat %a, bfloat %b) #0 {
 ; CHECK-BF16-LABEL: test_atan2:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    fmov w9, s1
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    // kill: def $s1 killed $s1 killed $q1
 ; CHECK-BF16-NEXT:    bl atan2f
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1469,10 +1300,9 @@ define bfloat @test_cosh(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_cosh:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl coshf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1488,10 +1318,9 @@ define bfloat @test_cosh(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_cosh:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl coshf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1504,10 +1333,9 @@ define bfloat @test_sinh(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_sinh:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl sinhf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1523,10 +1351,9 @@ define bfloat @test_sinh(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_sinh:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl sinhf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1539,10 +1366,9 @@ define bfloat @test_tanh(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_tanh:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl tanhf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1558,10 +1384,9 @@ define bfloat @test_tanh(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_tanh:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl tanhf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1574,14 +1399,12 @@ define bfloat @test_pow(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_pow:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-CVT-NEXT:    // kill: def $s1 killed $s1 killed $q1
 ; CHECK-CVT-NEXT:    bl powf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1597,14 +1420,12 @@ define bfloat @test_pow(bfloat %a, bfloat %b) #0 {
 ; CHECK-BF16-LABEL: test_pow:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    fmov w9, s1
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    // kill: def $s1 killed $s1 killed $q1
 ; CHECK-BF16-NEXT:    bl powf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1617,10 +1438,9 @@ define bfloat @test_exp(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_exp:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl expf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1636,10 +1456,9 @@ define bfloat @test_exp(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_exp:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl expf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1652,10 +1471,9 @@ define bfloat @test_exp2(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_exp2:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl exp2f
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1671,10 +1489,9 @@ define bfloat @test_exp2(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_exp2:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl exp2f
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1687,10 +1504,9 @@ define bfloat @test_log(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_log:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl logf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1706,10 +1522,9 @@ define bfloat @test_log(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_log:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl logf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1722,10 +1537,9 @@ define bfloat @test_log10(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_log10:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl log10f
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1741,10 +1555,9 @@ define bfloat @test_log10(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_log10:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl log10f
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1757,10 +1570,9 @@ define bfloat @test_log2(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_log2:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl log2f
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1776,10 +1588,9 @@ define bfloat @test_log2(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_log2:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl log2f
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1791,20 +1602,14 @@ define bfloat @test_log2(bfloat %a) #0 {
 define bfloat @test_fma(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; CHECK-CVT-LABEL: test_fma:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s2
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    fmov s2, w10
+; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w10, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    fmadd s0, s2, s1, s0
+; CHECK-CVT-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fmadd s0, s0, s1, s2
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    ubfx w9, w8, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w8, w10
@@ -1816,19 +1621,13 @@ define bfloat @test_fma(bfloat %a, bfloat %b, bfloat %c) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fma:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s2
-; CHECK-BF16-NEXT:    fmov w9, s1
-; CHECK-BF16-NEXT:    fmov w10, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    lsl w10, w10, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fmov s2, w10
-; CHECK-BF16-NEXT:    fmadd s0, s2, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmadd s0, s0, s1, s2
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.fma.f16(bfloat %a, bfloat %b, bfloat %c)
@@ -1851,16 +1650,12 @@ define bfloat @test_fabs(bfloat %a) #0 {
 define bfloat @test_minnum(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_minnum:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    fminnm s0, s1, s0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fminnm s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w9, w8
@@ -1872,15 +1667,11 @@ define bfloat @test_minnum(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_minnum:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fminnm s0, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fminnm s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.minnum.f16(bfloat %a, bfloat %b)
@@ -1890,16 +1681,12 @@ define bfloat @test_minnum(bfloat %a, bfloat %b) #0 {
 define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_maxnum:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    fmaxnm s0, s1, s0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fmaxnm s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w9, w8
@@ -1911,15 +1698,11 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_maxnum:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fmaxnm s0, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmaxnm s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.maxnum.f16(bfloat %a, bfloat %b)
@@ -1929,16 +1712,12 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 {
 define bfloat @test_copysign(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_copysign:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s1
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    lsr w8, w8, #16
 ; CHECK-CVT-NEXT:    fmov s0, w8
@@ -1947,16 +1726,12 @@ define bfloat @test_copysign(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_copysign:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-BF16-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.copysign.f16(bfloat %a, bfloat %b)
@@ -1966,12 +1741,10 @@ define bfloat @test_copysign(bfloat %a, bfloat %b) #0 {
 define bfloat @test_copysign_f32(bfloat %a, float %b) #0 {
 ; CHECK-CVT-LABEL: test_copysign_f32:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mvni v2.4s, #128, lsl #24
 ; CHECK-CVT-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    lsr w8, w8, #16
@@ -1981,12 +1754,10 @@ define bfloat @test_copysign_f32(bfloat %a, float %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_copysign_f32:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-BF16-NEXT:    mvni v2.4s, #128, lsl #24
 ; CHECK-BF16-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -1998,12 +1769,10 @@ define bfloat @test_copysign_f32(bfloat %a, float %b) #0 {
 define bfloat @test_copysign_f64(bfloat %a, double %b) #0 {
 ; CHECK-CVT-LABEL: test_copysign_f64:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    fcvt s1, d1
 ; CHECK-CVT-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    lsr w8, w8, #16
@@ -2013,12 +1782,10 @@ define bfloat @test_copysign_f64(bfloat %a, double %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_copysign_f64:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-BF16-NEXT:    fcvt s1, d1
 ; CHECK-BF16-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2032,34 +1799,33 @@ define bfloat @test_copysign_f64(bfloat %a, double %b) #0 {
 define float @test_copysign_extended(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_copysign_extended:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s1
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    movi v2.4s, #16
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    ushl v0.4s, v0.4s, v2.4s
 ; CHECK-CVT-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-CVT-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    lsr w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
 ; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-BF16-LABEL: test_copysign_extended:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    movi v2.4s, #16
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    ushl v0.4s, v0.4s, v2.4s
 ; CHECK-BF16-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.copysign.f16(bfloat %a, bfloat %b)
@@ -2070,11 +1836,9 @@ define float @test_copysign_extended(bfloat %a, bfloat %b) #0 {
 define bfloat @test_floor(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_floor:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frintm s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2087,10 +1851,8 @@ define bfloat @test_floor(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_floor:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frintm s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2101,11 +1863,9 @@ define bfloat @test_floor(bfloat %a) #0 {
 define bfloat @test_ceil(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_ceil:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frintp s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2118,10 +1878,8 @@ define bfloat @test_ceil(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_ceil:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frintp s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2132,11 +1890,9 @@ define bfloat @test_ceil(bfloat %a) #0 {
 define bfloat @test_trunc(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_trunc:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frintz s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2149,10 +1905,8 @@ define bfloat @test_trunc(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_trunc:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frintz s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2163,11 +1917,9 @@ define bfloat @test_trunc(bfloat %a) #0 {
 define bfloat @test_rint(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_rint:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frintx s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2180,10 +1932,8 @@ define bfloat @test_rint(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_rint:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frintx s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2194,11 +1944,9 @@ define bfloat @test_rint(bfloat %a) #0 {
 define bfloat @test_nearbyint(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_nearbyint:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frinti s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2211,10 +1959,8 @@ define bfloat @test_nearbyint(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_nearbyint:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frinti s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2225,11 +1971,9 @@ define bfloat @test_nearbyint(bfloat %a) #0 {
 define bfloat @test_round(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_round:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frinta s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2242,10 +1986,8 @@ define bfloat @test_round(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_round:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frinta s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2256,11 +1998,9 @@ define bfloat @test_round(bfloat %a) #0 {
 define bfloat @test_roundeven(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_roundeven:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frintn s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2273,10 +2013,8 @@ define bfloat @test_roundeven(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_roundeven:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frintn s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2287,27 +2025,21 @@ define bfloat @test_roundeven(bfloat %a) #0 {
 define bfloat @test_fmuladd(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; CHECK-CVT-LABEL: test_fmuladd:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s1
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w10, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    fmul s0, s1, s0
+; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fmul s0, s0, s1
+; CHECK-CVT-NEXT:    shll v1.4s, v2.4h, #16
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    ubfx w9, w8, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w8, w10
 ; CHECK-CVT-NEXT:    add w8, w9, w8
-; CHECK-CVT-NEXT:    fmov w9, s2
 ; CHECK-CVT-NEXT:    lsr w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
 ; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    fadd s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    ubfx w9, w8, #16, #1
@@ -2320,23 +2052,15 @@ define bfloat @test_fmuladd(bfloat %a, bfloat %b, bfloat %c) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fmuladd:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fmov w9, s2
-; CHECK-BF16-NEXT:    fmul s0, s1, s0
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmul s0, s0, s1
+; CHECK-BF16-NEXT:    shll v1.4s, v2.4h, #16
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    fadd s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
index c03e2e5..a609e33 100644
--- a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
@@ -272,9 +272,8 @@ define <8 x bfloat> @d_to_h(<8 x double> %a) {
 define <8 x float> @h_to_s(<8 x bfloat> %a) {
 ; CHECK-LABEL: h_to_s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll2 v1.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    ret
   %1 = fpext <8 x bfloat> %a to <8 x float>
   ret <8 x float> %1
@@ -283,13 +282,12 @@ define <8 x float> @h_to_s(<8 x bfloat> %a) {
 define <8 x double> @h_to_d(<8 x bfloat> %a) {
 ; CHECK-LABEL: h_to_d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    shll v2.4s, v0.4h, #16
-; CHECK-NEXT:    fcvtl v0.2d, v2.2s
-; CHECK-NEXT:    shll v4.4s, v1.4h, #16
-; CHECK-NEXT:    fcvtl2 v1.2d, v2.4s
-; CHECK-NEXT:    fcvtl2 v3.2d, v4.4s
-; CHECK-NEXT:    fcvtl v2.2d, v4.2s
+; CHECK-NEXT:    shll v1.4s, v0.4h, #16
+; CHECK-NEXT:    shll2 v2.4s, v0.8h, #16
+; CHECK-NEXT:    fcvtl v0.2d, v1.2s
+; CHECK-NEXT:    fcvtl2 v3.2d, v2.4s
+; CHECK-NEXT:    fcvtl2 v1.2d, v1.4s
+; CHECK-NEXT:    fcvtl v2.2d, v2.2s
 ; CHECK-NEXT:    ret
   %1 = fpext <8 x bfloat> %a to <8 x double>
   ret <8 x double> %1
@@ -788,11 +786,10 @@ define void @test_insert_at_zero(bfloat %a, ptr %b) #0 {
 define <8 x i8> @fptosi_i8(<8 x bfloat> %a) #0 {
 ; CHECK-LABEL: fptosi_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll2 v1.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
@@ -803,11 +800,10 @@ define <8 x i8> @fptosi_i8(<8 x bfloat> %a) #0 {
 define <8 x i16> @fptosi_i16(<8 x bfloat> %a) #0 {
 ; CHECK-LABEL: fptosi_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll2 v1.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %1 = fptosi<8 x bfloat> %a to <8 x i16>
@@ -817,11 +813,10 @@ define <8 x i16> @fptosi_i16(<8 x bfloat> %a) #0 {
 define <8 x i8> @fptoui_i8(<8 x bfloat> %a) #0 {
 ; CHECK-LABEL: fptoui_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll2 v1.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
@@ -832,11 +827,10 @@ define <8 x i8> @fptoui_i8(<8 x bfloat> %a) #0 {
 define <8 x i16> @fptoui_i16(<8 x bfloat> %a) #0 {
 ; CHECK-LABEL: fptoui_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll2 v1.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %1 = fptoui<8 x bfloat> %a to <8 x i16>
@@ -846,90 +840,58 @@ define <8 x i16> @fptoui_i16(<8 x bfloat> %a) #0 {
 define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_une:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, ne
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, ne
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, ne
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, ne
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, ne
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -941,96 +903,64 @@ define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ueq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w11, s0
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    lsl w9, w11, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s7, w9
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    csetm w10, eq
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csinv w10, w10, wzr, vc
-; CHECK-NEXT:    fcmp s7, s6
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w11, s4
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h4, v0.h[4]
-; CHECK-NEXT:    mov h7, v1.h[5]
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    csinv w8, w8, wzr, vc
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
 ; CHECK-NEXT:    csetm w9, eq
 ; CHECK-NEXT:    csinv w9, w9, wzr, vc
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    mov h3, v1.h[4]
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
 ; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    lsl w11, w11, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s5, w11
-; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    mov v2.h[1], w10
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    fmov w10, s4
 ; CHECK-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov s6, w10
 ; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    fmov w9, s7
-; CHECK-NEXT:    fmov w10, s5
 ; CHECK-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    fmov s6, w10
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    fmov w10, s4
 ; CHECK-NEXT:    csetm w8, eq
 ; CHECK-NEXT:    csinv w8, w8, wzr, vc
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    lsl w8, w9, #16
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    csetm w10, eq
-; CHECK-NEXT:    csinv w10, w10, wzr, vc
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, eq
 ; CHECK-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    csinv w8, w8, wzr, vc
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
 ; CHECK-NEXT:    csetm w8, eq
 ; CHECK-NEXT:    csinv w8, w8, wzr, vc
@@ -1044,90 +974,58 @@ define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ugt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, hi
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, hi
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, hi
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, hi
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, hi
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1139,90 +1037,58 @@ define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_uge:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, pl
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, pl
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, pl
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, pl
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, pl
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1234,90 +1100,58 @@ define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ult:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, lt
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, lt
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, lt
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, lt
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, lt
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1329,90 +1163,58 @@ define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ule:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, le
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, le
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, le
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, le
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, le
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1424,90 +1226,58 @@ define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_uno:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, vs
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, vs
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, vs
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, vs
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, vs
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1519,96 +1289,64 @@ define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_one(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w11, s0
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    lsl w9, w11, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s7, w9
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    csetm w10, mi
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csinv w10, w10, wzr, le
-; CHECK-NEXT:    fcmp s7, s6
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w11, s4
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h4, v0.h[4]
-; CHECK-NEXT:    mov h7, v1.h[5]
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
+; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    csinv w8, w8, wzr, le
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
 ; CHECK-NEXT:    csetm w9, mi
 ; CHECK-NEXT:    csinv w9, w9, wzr, le
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    mov h3, v1.h[4]
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
 ; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    lsl w11, w11, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s5, w11
-; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    mov v2.h[1], w10
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    fmov w10, s4
 ; CHECK-NEXT:    csinv w8, w8, wzr, le
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov s6, w10
 ; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    fmov w9, s7
-; CHECK-NEXT:    fmov w10, s5
 ; CHECK-NEXT:    csinv w8, w8, wzr, le
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    fmov s6, w10
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    fmov w10, s4
 ; CHECK-NEXT:    csetm w8, mi
 ; CHECK-NEXT:    csinv w8, w8, wzr, le
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    lsl w8, w9, #16
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    csetm w10, mi
-; CHECK-NEXT:    csinv w10, w10, wzr, le
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, mi
 ; CHECK-NEXT:    csinv w8, w8, wzr, le
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    csinv w8, w8, wzr, le
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
 ; CHECK-NEXT:    csetm w8, mi
 ; CHECK-NEXT:    csinv w8, w8, wzr, le
@@ -1622,90 +1360,58 @@ define <8 x i1> @test_fcmp_one(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_oeq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, eq
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, eq
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, eq
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, eq
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1717,90 +1423,58 @@ define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ogt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, gt
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, gt
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, gt
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, gt
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, gt
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1812,90 +1486,58 @@ define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_oge:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, ge
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, ge
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, ge
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, ge
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, ge
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1907,90 +1549,58 @@ define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_olt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, mi
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, mi
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, mi
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, mi
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -2002,90 +1612,58 @@ define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ole:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, ls
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, ls
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, ls
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, ls
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, ls
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -2097,90 +1675,58 @@ define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ord(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ord:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, vc
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, vc
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, vc
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, vc
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, vc
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
diff --git a/llvm/test/CodeGen/AArch64/cgdata-no-merge-attached-call-garget.ll b/llvm/test/CodeGen/AArch64/cgdata-no-merge-attached-call-garget.ll
new file mode 100644
index 0000000..1163314
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-no-merge-attached-call-garget.ll
@@ -0,0 +1,37 @@
+; This test verifies that two similar functions, f1 and f2, are not merged
+; when their attached call targets differ, since these targets cannot be parameterized.
+
+; RUN: llc -mtriple=arm64-apple-darwin -enable-global-merge-func=true < %s | FileCheck %s
+
+; CHECK-NOT: _f1.Tgm
+; CHECK-NOT: _f2.Tgm
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+define i64 @f1(ptr %0) {
+  %2 = call ptr @g1(ptr %0, i32 0) minsize [ "clang.arc.attachedcall"(ptr @llvm.objc.unsafeClaimAutoreleasedReturnValue) ]
+  tail call void (...) @llvm.objc.clang.arc.noop.use(ptr %2)
+  %3 = call i64 @g2(ptr %2)
+  tail call void @objc_release(ptr %2)
+  %4 = tail call i64 @g3(i64 %3)
+  ret i64 %4
+}
+
+define i64 @f2(ptr %0) {
+  %2 = call ptr @g1(ptr %0, i32 0) minsize [ "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ]
+  tail call void (...) @llvm.objc.clang.arc.noop.use(ptr %2)
+  %3 = call i64 @g2(ptr %2)
+  tail call void @objc_release(ptr %2)
+  %4 = tail call i64 @g3(i64 %3)
+  ret i64 %4
+}
+
+declare ptr @g1(ptr, i32)
+declare i64 @g2(ptr)
+declare i64 @g3(i64)
+
+declare void @llvm.objc.clang.arc.noop.use(...)
+declare ptr @llvm.objc.unsafeClaimAutoreleasedReturnValue(ptr)
+declare ptr @llvm.objc.retainAutoreleasedReturnValue(ptr)
+declare void @objc_release(ptr)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
new file mode 100644
index 0000000..11cf4c3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
@@ -0,0 +1,1136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s --check-prefix=CHECK-SVE2
+; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-SVE
+; RUN: opt -S --passes=complex-deinterleaving %s -o - | FileCheck %s --check-prefix=CHECK-NOSVE
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define i32 @cdotp_i8_rot0(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot0(
+; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[TMP11:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 16)
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 16)
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 4)
+; CHECK-SVE2-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP5]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], i32 0)
+; CHECK-SVE2-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP6]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i32 0)
+; CHECK-SVE2-NEXT:    [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP7]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP10]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP8]], i64 4)
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP10]])
+; CHECK-SVE2-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot0(
+; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i32 [[TMP11]]
+;
+; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot0(
+; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
+  %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
+  %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
+  %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
+  %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
+  %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
+  %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
+  %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul)
+  %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
+  %imag.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %imag.mul
+  %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul.neg)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
+  ret i32 %0
+}
+
+define i32 @cdotp_i8_rot90(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot90(
+; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[TMP11:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 16)
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 16)
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 4)
+; CHECK-SVE2-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP5]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], i32 90)
+; CHECK-SVE2-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP6]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i32 90)
+; CHECK-SVE2-NEXT:    [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP7]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP10]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP8]], i64 4)
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP10]])
+; CHECK-SVE2-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot90(
+; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i32 [[TMP11]]
+;
+; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot90(
+; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
+  %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
+  %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
+  %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
+  %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
+  %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
+  %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.imag.ext
+  %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul)
+  %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.real.ext
+  %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
+  ret i32 %0
+}
+
+define i32 @cdotp_i8_rot180(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot180(
+; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[TMP11:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 16)
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 16)
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 4)
+; CHECK-SVE2-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP5]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], i32 180)
+; CHECK-SVE2-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP6]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i32 180)
+; CHECK-SVE2-NEXT:    [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP7]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP10]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP8]], i64 4)
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP10]])
+; CHECK-SVE2-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot180(
+; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i32 [[TMP11]]
+;
+; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot180(
+; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
+  %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
+  %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
+  %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
+  %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
+  %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
+  %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
+  %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul)
+  %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
+  %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
+  ret i32 %0
+}
+
+define i32 @cdotp_i8_rot270(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot270(
+; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[TMP11:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 16)
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 16)
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 4)
+; CHECK-SVE2-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP5]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], i32 270)
+; CHECK-SVE2-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP6]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i32 270)
+; CHECK-SVE2-NEXT:    [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP7]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP10]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP8]], i64 4)
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP10]])
+; CHECK-SVE2-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot270(
+; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i32 [[TMP11]]
+;
+; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot270(
+; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
+  %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
+  %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
+  %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
+  %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
+  %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
+  %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.imag.ext
+  %real.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %real.mul
+  %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul.neg)
+  %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.real.ext
+  %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
+  ret i32 %0
+}
+
+define i64 @cdotp_i16_rot0(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
+; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot0(
+; CHECK-SVE2-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[TMP11:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 8)
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 8)
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 2)
+; CHECK-SVE2-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], i32 0)
+; CHECK-SVE2-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i32 0)
+; CHECK-SVE2-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP7]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP10]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP8]], i64 2)
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP10]])
+; CHECK-SVE2-NEXT:    ret i64 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot0(
+; CHECK-SVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 8 x i64> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL_NEG]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i64 [[TMP11]]
+;
+; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot0(
+; CHECK-NOSVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 8 x i64> zeroinitializer, [[IMAG_MUL]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i64 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 2 x i64> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %a)
+  %b.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %b)
+  %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 8 x i16> %a.real to <vscale x 8 x i64>
+  %a.imag.ext = sext <vscale x 8 x i16> %a.imag to <vscale x 8 x i64>
+  %b.real.ext = sext <vscale x 8 x i16> %b.real to <vscale x 8 x i64>
+  %b.imag.ext = sext <vscale x 8 x i16> %b.imag to <vscale x 8 x i64>
+  %real.mul = mul <vscale x 8 x i64> %b.real.ext, %a.real.ext
+  %real.mul.reduced = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %vec.phi, <vscale x 8 x i64> %real.mul)
+  %imag.mul = mul <vscale x 8 x i64> %b.imag.ext, %a.imag.ext
+  %imag.mul.neg = sub <vscale x 8 x i64> zeroinitializer, %imag.mul
+  %partial.reduce.sub = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %real.mul.reduced, <vscale x 8 x i64> %imag.mul.neg)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %partial.reduce.sub)
+  ret i64 %0
+}
+
+define i64 @cdotp_i16_rot90(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
+; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot90(
+; CHECK-SVE2-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[TMP11:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 8)
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 8)
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 2)
+; CHECK-SVE2-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], i32 90)
+; CHECK-SVE2-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i32 90)
+; CHECK-SVE2-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP7]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP10]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP8]], i64 2)
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP10]])
+; CHECK-SVE2-NEXT:    ret i64 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot90(
+; CHECK-SVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i64 [[TMP11]]
+;
+; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot90(
+; CHECK-NOSVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i64 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 2 x i64> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %a)
+  %b.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %b)
+  %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 8 x i16> %a.real to <vscale x 8 x i64>
+  %a.imag.ext = sext <vscale x 8 x i16> %a.imag to <vscale x 8 x i64>
+  %b.real.ext = sext <vscale x 8 x i16> %b.real to <vscale x 8 x i64>
+  %b.imag.ext = sext <vscale x 8 x i16> %b.imag to <vscale x 8 x i64>
+  %real.mul = mul <vscale x 8 x i64> %b.real.ext, %a.imag.ext
+  %real.mul.reduced = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %vec.phi, <vscale x 8 x i64> %real.mul)
+  %imag.mul = mul <vscale x 8 x i64> %b.imag.ext, %a.real.ext
+  %partial.reduce.sub = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %real.mul.reduced, <vscale x 8 x i64> %imag.mul)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %partial.reduce.sub)
+  ret i64 %0
+}
+
+define i64 @cdotp_i16_rot180(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
+; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot180(
+; CHECK-SVE2-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[TMP11:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 8)
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 8)
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 2)
+; CHECK-SVE2-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], i32 180)
+; CHECK-SVE2-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i32 180)
+; CHECK-SVE2-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP7]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP10]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP8]], i64 2)
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP10]])
+; CHECK-SVE2-NEXT:    ret i64 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot180(
+; CHECK-SVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i64 [[TMP11]]
+;
+; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot180(
+; CHECK-NOSVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i64 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 2 x i64> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %a)
+  %b.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %b)
+  %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 8 x i16> %a.real to <vscale x 8 x i64>
+  %a.imag.ext = sext <vscale x 8 x i16> %a.imag to <vscale x 8 x i64>
+  %b.real.ext = sext <vscale x 8 x i16> %b.real to <vscale x 8 x i64>
+  %b.imag.ext = sext <vscale x 8 x i16> %b.imag to <vscale x 8 x i64>
+  %real.mul = mul <vscale x 8 x i64> %b.real.ext, %a.real.ext
+  %real.mul.reduced = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %vec.phi, <vscale x 8 x i64> %real.mul)
+  %imag.mul = mul <vscale x 8 x i64> %b.imag.ext, %a.imag.ext
+  %partial.reduce.sub = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %real.mul.reduced, <vscale x 8 x i64> %imag.mul)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %partial.reduce.sub)
+  ret i64 %0
+}
+
+define i64 @cdotp_i16_rot270(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
+; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot270(
+; CHECK-SVE2-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[TMP11:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 8)
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 8)
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 2)
+; CHECK-SVE2-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], i32 270)
+; CHECK-SVE2-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i32 270)
+; CHECK-SVE2-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP7]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP10]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP8]], i64 2)
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP10]])
+; CHECK-SVE2-NEXT:    ret i64 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot270(
+; CHECK-SVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_NEG:%.*]] = sub <vscale x 8 x i64> zeroinitializer, [[REAL_MUL]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL_NEG]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i64 [[TMP11]]
+;
+; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot270(
+; CHECK-NOSVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_NEG:%.*]] = sub <vscale x 8 x i64> zeroinitializer, [[REAL_MUL]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i64 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 2 x i64> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %a)
+  %b.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %b)
+  %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 8 x i16> %a.real to <vscale x 8 x i64>
+  %a.imag.ext = sext <vscale x 8 x i16> %a.imag to <vscale x 8 x i64>
+  %b.real.ext = sext <vscale x 8 x i16> %b.real to <vscale x 8 x i64>
+  %b.imag.ext = sext <vscale x 8 x i16> %b.imag to <vscale x 8 x i64>
+  %real.mul = mul <vscale x 8 x i64> %b.real.ext, %a.imag.ext
+  %real.mul.neg = sub <vscale x 8 x i64> zeroinitializer, %real.mul
+  %real.mul.reduced = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %vec.phi, <vscale x 8 x i64> %real.mul.neg)
+  %imag.mul = mul <vscale x 8 x i64> %b.imag.ext, %a.real.ext
+  %partial.reduce.sub = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %real.mul.reduced, <vscale x 8 x i64> %imag.mul)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %partial.reduce.sub)
+  ret i64 %0
+}
+
+
+define i32 @not_cdotp(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i32 @not_cdotp(
+; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE2-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE2-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE2-NEXT:    [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
+; CHECK-SVE2-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
+; CHECK-SVE2-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE2-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE2-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE2-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i32 @not_cdotp(
+; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-NOSVE-LABEL: define i32 @not_cdotp(
+; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
+  %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
+  %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
+  %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
+  %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
+  %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
+  %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
+  %real.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %real.mul
+  %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul.neg)
+  %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
+  %imag.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %imag.mul
+  %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul.neg)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
+  ret i32 %0
+}
+
+define i16 @invalid_type(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i16 @invalid_type(
+; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE2-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE2-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE2-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-SVE2-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE2-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE2-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE2-NEXT:    ret i16 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i16 @invalid_type(
+; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i16 [[TMP0]]
+;
+; CHECK-NOSVE-LABEL: define i16 @invalid_type(
+; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i16 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 8 x i16> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
+  %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
+  %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
+  %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
+  %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
+  %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
+  %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
+  %real.mul.reduced = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> %vec.phi, <vscale x 16 x i32> %real.mul)
+  %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
+  %imag.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %imag.mul
+  %partial.reduce.sub = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> %real.mul.reduced, <vscale x 16 x i32> %imag.mul.neg)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> %partial.reduce.sub)
+  ret i16 %0
+}
+
+define i32 @not_cdotp_i8_rot0_fixed_length(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
+; CHECK-SVE2-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]])
+; CHECK-SVE2-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]])
+; CHECK-SVE2-NEXT:    [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32>
+; CHECK-SVE2-NEXT:    [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32>
+; CHECK-SVE2-NEXT:    [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32>
+; CHECK-SVE2-NEXT:    [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32>
+; CHECK-SVE2-NEXT:    [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE2-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]])
+; CHECK-SVE2-NEXT:    [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE2-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE2-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE2-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
+; CHECK-SVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-NOSVE-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
+; CHECK-NOSVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %a)
+  %b.deinterleaved = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %b)
+  %a.real = extractvalue { <16 x i8>, <16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <16 x i8>, <16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <16 x i8>, <16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <16 x i8>, <16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <16 x i8> %a.real to <16 x i32>
+  %a.imag.ext = sext <16 x i8> %a.imag to <16 x i32>
+  %b.real.ext = sext <16 x i8> %b.real to <16 x i32>
+  %b.imag.ext = sext <16 x i8> %b.imag to <16 x i32>
+  %real.mul = mul <16 x i32> %b.real.ext, %a.real.ext
+  %real.mul.reduced = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %vec.phi, <16 x i32> %real.mul)
+  %imag.mul = mul <16 x i32> %b.imag.ext, %a.imag.ext
+  %imag.mul.neg = sub <16 x i32> zeroinitializer, %imag.mul
+  %partial.reduce.sub = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %real.mul.reduced, <16 x i32> %imag.mul.neg)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %partial.reduce.sub)
+  ret i32 %0
+}
+
+declare <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16>, <vscale x 16 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>)
+declare <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i32(<vscale x 2 x i64>, <vscale x 16 x i32>)
+
+declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
index 68cb29f..7542e9c 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
@@ -29,3 +29,92 @@ bb193:                                            ; preds = %bb173
   store volatile i32 0, ptr null, align 4
   unreachable
 }
+
+; Check that the deinterleaving pass doesn't try to transform isolated patterns without a relevant deinterleaving pattern
+define i32 @check_deinterleaving_has_deinterleave(ptr %a) {
+; CHECK-LABEL: check_deinterleaving_has_deinterleave:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    movi v1.4s, #1
+; CHECK-NEXT:    add x8, x0, #16
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    mov w9, #32 // =0x20
+; CHECK-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-NEXT:    movi v7.2d, #0000000000000000
+; CHECK-NEXT:    movi v6.2d, #0000000000000000
+; CHECK-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-NEXT:  .LBB1_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldp q17, q18, [x8, #-16]
+; CHECK-NEXT:    subs x9, x9, #32
+; CHECK-NEXT:    add x8, x8, #32
+; CHECK-NEXT:    cmeq v17.16b, v17.16b, #0
+; CHECK-NEXT:    cmeq v18.16b, v18.16b, #0
+; CHECK-NEXT:    ushll2 v19.8h, v17.16b, #0
+; CHECK-NEXT:    ushll v17.8h, v17.8b, #0
+; CHECK-NEXT:    ushll2 v20.8h, v18.16b, #0
+; CHECK-NEXT:    ushll v18.8h, v18.8b, #0
+; CHECK-NEXT:    ushll v21.4s, v19.4h, #0
+; CHECK-NEXT:    ushll2 v19.4s, v19.8h, #0
+; CHECK-NEXT:    ushll v22.4s, v17.4h, #0
+; CHECK-NEXT:    ushll2 v17.4s, v17.8h, #0
+; CHECK-NEXT:    ushll2 v23.4s, v20.8h, #0
+; CHECK-NEXT:    ushll v24.4s, v18.4h, #0
+; CHECK-NEXT:    ushll2 v18.4s, v18.8h, #0
+; CHECK-NEXT:    ushll v20.4s, v20.4h, #0
+; CHECK-NEXT:    and v21.16b, v21.16b, v1.16b
+; CHECK-NEXT:    and v19.16b, v19.16b, v1.16b
+; CHECK-NEXT:    and v22.16b, v22.16b, v1.16b
+; CHECK-NEXT:    and v17.16b, v17.16b, v1.16b
+; CHECK-NEXT:    and v23.16b, v23.16b, v1.16b
+; CHECK-NEXT:    and v24.16b, v24.16b, v1.16b
+; CHECK-NEXT:    and v18.16b, v18.16b, v1.16b
+; CHECK-NEXT:    and v20.16b, v20.16b, v1.16b
+; CHECK-NEXT:    add v4.4s, v4.4s, v19.4s
+; CHECK-NEXT:    add v2.4s, v2.4s, v21.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v22.4s
+; CHECK-NEXT:    add v3.4s, v3.4s, v17.4s
+; CHECK-NEXT:    add v16.4s, v16.4s, v23.4s
+; CHECK-NEXT:    add v5.4s, v5.4s, v24.4s
+; CHECK-NEXT:    add v6.4s, v6.4s, v20.4s
+; CHECK-NEXT:    add v7.4s, v7.4s, v18.4s
+; CHECK-NEXT:    b.ne .LBB1_1
+; CHECK-NEXT:  // %bb.2: // %middle.block
+; CHECK-NEXT:    add v1.4s, v7.4s, v3.4s
+; CHECK-NEXT:    add v3.4s, v16.4s, v4.4s
+; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
+; CHECK-NEXT:    add v2.4s, v6.4s, v2.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+    br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %9, %vector.body ]
+  %vec.phi50 = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
+  %next.gep = getelementptr i8, ptr %a, i64 %index
+  %4 = getelementptr i8, ptr %next.gep, i64 16
+  %wide.load = load <16 x i8>, ptr %next.gep, align 1
+  %wide.load51 = load <16 x i8>, ptr %4, align 1
+  %5 = icmp eq <16 x i8> %wide.load, zeroinitializer
+  %6 = icmp eq <16 x i8> %wide.load51, zeroinitializer
+  %7 = zext <16 x i1> %5 to <16 x i32>
+  %8 = zext <16 x i1> %6 to <16 x i32>
+  %9 = add <16 x i32> %vec.phi, %7
+  %10 = add <16 x i32> %vec.phi50, %8
+  %index.next = add nuw i64 %index, 32
+  %11 = icmp eq i64 %index.next, 32
+  br i1 %11, label %middle.block, label %vector.body
+
+middle.block:
+  %bin.rdx = add <16 x i32> %10, %9
+  %12 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %bin.rdx)
+  ret i32 %12
+}
diff --git a/llvm/test/CodeGen/AArch64/csel-subs-swapped.ll b/llvm/test/CodeGen/AArch64/csel-subs-swapped.ll
new file mode 100644
index 0000000..7c628cf
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/csel-subs-swapped.ll
@@ -0,0 +1,322 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+
+define i32 @eq_i32(i32 %x) {
+; CHECK-LABEL: eq_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    cmn w0, #512, lsl #12 // =2097152
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    csel w0, w0, w8, eq
+; CHECK-NEXT:    ret
+  %cmp = icmp eq i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @ne_i32(i32 %x) {
+; CHECK-LABEL: ne_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    cmn w0, #512, lsl #12 // =2097152
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    csel w0, w0, w8, ne
+; CHECK-NEXT:    ret
+  %cmp = icmp ne i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @sgt_i32(i32 %x) {
+; CHECK-LABEL: sgt_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    cmn w0, #512, lsl #12 // =2097152
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    csel w0, w0, w8, gt
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @sge_i32(i32 %x) {
+; CHECK-LABEL: sge_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    mov w9, #-2097153 // =0xffdfffff
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    cmp w0, w9
+; CHECK-NEXT:    csel w0, w0, w8, gt
+; CHECK-NEXT:    ret
+  %cmp = icmp sge i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @slt_i32(i32 %x) {
+; CHECK-LABEL: slt_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    cmn w0, #512, lsl #12 // =2097152
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    csel w0, w0, w8, lt
+; CHECK-NEXT:    ret
+  %cmp = icmp slt i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @sle_i32(i32 %x) {
+; CHECK-LABEL: sle_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    mov w9, #-2097151 // =0xffe00001
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    cmp w0, w9
+; CHECK-NEXT:    csel w0, w0, w8, lt
+; CHECK-NEXT:    ret
+  %cmp = icmp sle i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @ugt_i32(i32 %x) {
+; CHECK-LABEL: ugt_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    cmn w0, #512, lsl #12 // =2097152
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    csel w0, w0, w8, hi
+; CHECK-NEXT:    ret
+  %cmp = icmp ugt i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @uge_i32(i32 %x) {
+; CHECK-LABEL: uge_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsr w9, w0, #21
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    cmp w9, #2046
+; CHECK-NEXT:    csel w0, w0, w8, hi
+; CHECK-NEXT:    ret
+  %cmp = icmp uge i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @ult_i32(i32 %x) {
+; CHECK-LABEL: ult_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    cmn w0, #512, lsl #12 // =2097152
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    csel w0, w0, w8, lo
+; CHECK-NEXT:    ret
+  %cmp = icmp ult i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @ule_i32(i32 %x) {
+; CHECK-LABEL: ule_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    mov w9, #-2097151 // =0xffe00001
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    cmp w0, w9
+; CHECK-NEXT:    csel w0, w0, w8, lo
+; CHECK-NEXT:    ret
+  %cmp = icmp ule i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+
+define i64 @eq_i64(i64 %x) {
+; CHECK-LABEL: eq_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #100
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, eq
+; CHECK-NEXT:    ret
+  %cmp = icmp eq i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @ne_i64(i64 %x) {
+; CHECK-LABEL: ne_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #100
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, ne
+; CHECK-NEXT:    ret
+  %cmp = icmp ne i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @sgt_i64(i64 %x) {
+; CHECK-LABEL: sgt_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #100
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, gt
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @sge_i64(i64 %x) {
+; CHECK-LABEL: sge_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #99
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, gt
+; CHECK-NEXT:    ret
+  %cmp = icmp sge i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @slt_i64(i64 %x) {
+; CHECK-LABEL: slt_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #100
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, lt
+; CHECK-NEXT:    ret
+  %cmp = icmp slt i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @sle_i64(i64 %x) {
+; CHECK-LABEL: sle_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #101
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, lt
+; CHECK-NEXT:    ret
+  %cmp = icmp sle i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @ugt_i64(i64 %x) {
+; CHECK-LABEL: ugt_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #100
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, hi
+; CHECK-NEXT:    ret
+  %cmp = icmp ugt i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @uge_i64(i64 %x) {
+; CHECK-LABEL: uge_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #99
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, hi
+; CHECK-NEXT:    ret
+  %cmp = icmp uge i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @ult_i64(i64 %x) {
+; CHECK-LABEL: ult_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #100
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, lo
+; CHECK-NEXT:    ret
+  %cmp = icmp ult i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @ule_i64(i64 %x) {
+; CHECK-LABEL: ule_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #101
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, lo
+; CHECK-NEXT:    ret
+  %cmp = icmp ule i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+
+define i64 @both(i64 %x) {
+; CHECK-LABEL: both:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    sub x9, x0, #100
+; CHECK-NEXT:    cmp x0, #101
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x8, x9, lo
+; CHECK-NEXT:    ret
+  %cmp = icmp ule i64 %x, 100
+  %sub1 = sub i64 100, %x
+  %sub2 = sub i64 %x, 100
+  %retval.0 = select i1 %cmp, i64 %sub1, i64 %sub2
+  ret i64 %retval.0
+}
+
+define i32 @qabs(i32 %0) {
+; CHECK-LABEL: qabs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2147483648 // =0x80000000
+; CHECK-NEXT:    cmp w0, w8
+; CHECK-NEXT:    mov w8, #2147483647 // =0x7fffffff
+; CHECK-NEXT:    csneg w8, w8, w0, eq
+; CHECK-NEXT:    cmp w0, #0
+; CHECK-NEXT:    csel w0, w0, w8, gt
+; CHECK-NEXT:    ret
+  %cmp1 = icmp sgt i32 %0, 0
+  %cmp2 = icmp eq i32 %0, -2147483648
+  %sub = sub nsw i32 0, %0
+  %cond = select i1 %cmp2, i32 2147483647, i32 %sub
+  %cond6 = select i1 %cmp1, i32 %0, i32 %cond
+  ret i32 %cond6
+}
diff --git a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll
index 40684b0..e3263252 100644
--- a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll
+++ b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll
@@ -76,11 +76,9 @@ entry:
 define bfloat @t7(bfloat %x)  {
 ; CHECK-LABEL: t7:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs w9, s0
 ; CHECK-NEXT:    scvtf d0, w9
 ; CHECK-NEXT:    fcvtxn s0, d0
@@ -101,11 +99,9 @@ entry:
 define bfloat @t8(bfloat %x)  {
 ; CHECK-LABEL: t8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzu w9, s0
 ; CHECK-NEXT:    ucvtf d0, w9
 ; CHECK-NEXT:    fcvtxn s0, d0
@@ -198,11 +194,9 @@ entry:
 define bfloat @t7_strict(bfloat %x) #0 {
 ; CHECK-LABEL: t7_strict:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs w9, s0
 ; CHECK-NEXT:    scvtf d0, w9
 ; CHECK-NEXT:    fcvtxn s0, d0
@@ -223,11 +217,9 @@ entry:
 define bfloat @t8_strict(bfloat %x) #0 {
 ; CHECK-LABEL: t8_strict:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzu w9, s0
 ; CHECK-NEXT:    ucvtf d0, w9
 ; CHECK-NEXT:    fcvtxn s0, d0
diff --git a/llvm/test/CodeGen/AArch64/hwasan-zero-ptr.ll b/llvm/test/CodeGen/AArch64/hwasan-zero-ptr.ll
new file mode 100644
index 0000000..dca39fe
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/hwasan-zero-ptr.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -filetype asm -o - %s | FileCheck %s
+
+; This shows that when dereferencing a null pointer, HWASan will call
+; __hwasan_check_x4294967071_19_fixed_0_short_v2
+; (N.B. 4294967071 == 2**32 - 239 + 14 == 2**32 - X0 + XZR
+;
+; The source was generated from llvm/test/Instrumentation/HWAddressSanitizer/zero-ptr.ll.
+
+; ModuleID = '<stdin>'
+source_filename = "<stdin>"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android10000"
+
+$hwasan.module_ctor = comdat any
+
+@__start_hwasan_globals = external hidden constant [0 x i8]
+@__stop_hwasan_globals = external hidden constant [0 x i8]
+@hwasan.note = private constant { i32, i32, i32, [8 x i8], i32, i32 } { i32 8, i32 8, i32 3, [8 x i8] c"LLVM\00\00\00\00", i32 trunc (i64 sub (i64 ptrtoint (ptr @__start_hwasan_globals to i64), i64 ptrtoint (ptr @hwasan.note to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @__stop_hwasan_globals to i64), i64 ptrtoint (ptr @hwasan.note to i64)) to i32) }, section ".note.hwasan.globals", comdat($hwasan.module_ctor), align 4
+
+; Function Attrs: sanitize_hwaddress
+define void @test_store_to_zeroptr() #0 {
+; CHECK-LABEL: test_store_to_zeroptr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl __hwasan_check_x4294967071_19_fixed_0_short_v2
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    mov w9, #42 // =0x2a
+; CHECK-NEXT:    str x9, [x8]
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %.hwasan.shadow = call ptr asm "", "=r,0"(ptr null)
+  %b = inttoptr i64 0 to ptr
+  call void @llvm.hwasan.check.memaccess.shortgranules.fixedshadow(ptr %b, i32 19, i64 0)
+  store i64 42, ptr %b, align 8
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.hwasan.check.memaccess.shortgranules.fixedshadow(ptr, i32 immarg, i64 immarg) #1
+
+attributes #0 = { sanitize_hwaddress }
+attributes #1 = { nounwind }
+
+declare void @__hwasan_init()
+
+; Function Attrs: nounwind
+define internal void @hwasan.module_ctor() #1 comdat {
+; CHECK-LABEL: hwasan.module_ctor:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    bl __hwasan_init
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  call void @__hwasan_init()
+  ret void
+}
+
+!llvm.module.flags = !{!1}
+
+!0 = !{ptr @hwasan.note}
+!1 = !{i32 4, !"nosanitize_hwaddress", i32 1}
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner.ll b/llvm/test/CodeGen/AArch64/machine-combiner.ll
index 70a6388..c8df283 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner.ll
+++ b/llvm/test/CodeGen/AArch64/machine-combiner.ll
@@ -262,8 +262,8 @@ define half @reassociate_adds_half(half %x0, half %x1, half %x2, half %x3) {
 ; CHECK-UNSAFE-LABEL: reassociate_adds_half:
 ; CHECK-UNSAFE:       // %bb.0:
 ; CHECK-UNSAFE-NEXT:    fdiv h0, h0, h1
-; CHECK-UNSAFE-NEXT:    fadd h1, h3, h2
-; CHECK-UNSAFE-NEXT:    fadd h0, h1, h0
+; CHECK-UNSAFE-NEXT:    fadd h2, h3, h2
+; CHECK-UNSAFE-NEXT:    fadd h0, h2, h0
 ; CHECK-UNSAFE-NEXT:    ret
   %t0 = fdiv half %x0, %x1
   %t1 = fadd half %x2, %t0
@@ -284,8 +284,8 @@ define half @reassociate_muls_half(half %x0, half %x1, half %x2, half %x3) {
 ; CHECK-UNSAFE-LABEL: reassociate_muls_half:
 ; CHECK-UNSAFE:       // %bb.0:
 ; CHECK-UNSAFE-NEXT:    fdiv h0, h0, h1
-; CHECK-UNSAFE-NEXT:    fmul h1, h3, h2
-; CHECK-UNSAFE-NEXT:    fmul h0, h1, h0
+; CHECK-UNSAFE-NEXT:    fmul h2, h3, h2
+; CHECK-UNSAFE-NEXT:    fmul h0, h2, h0
 ; CHECK-UNSAFE-NEXT:    ret
   %t0 = fdiv half %x0, %x1
   %t1 = fmul half %x2, %t0
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
index 17f8263..a32c53a 100644
--- a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
+++ b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
@@ -313,9 +313,8 @@ define void @one_dimensional_with_store(ptr %a, ptr %b, ptr %c, i32 %N) {
 ; CHECK-NEXT:    rev w9, w9
 ; CHECK-NEXT:    cmp w9, w10
 ; CHECK-NEXT:    cset w9, hi
-; CHECK-NEXT:    cset w10, lo
+; CHECK-NEXT:    csinv w9, w9, wzr, hs
 ; CHECK-NEXT:    subs x8, x8, #1
-; CHECK-NEXT:    sub w9, w9, w10
 ; CHECK-NEXT:    strb w9, [x2], #1
 ; CHECK-NEXT:    b.ne .LBB4_1
 ; CHECK-NEXT:  // %bb.2: // %for.exit
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-cfi.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-cfi.ll
index 4bbbe40..e7de540 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-cfi.ll
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-cfi.ll
@@ -9,9 +9,9 @@ define void @a() "sign-return-address"="all" "sign-return-address-key"="b_key" {
 ; CHECK-LABEL:         a:                     // @a
 ; CHECK:               // %bb.0:
 ; CHECK-NEXT:          .cfi_b_key_frame
+; CHECK-NEXT:          .cfi_negate_ra_state
 ; V8A-NEXT:            hint #27
 ; V83A-NEXT:           pacibsp
-; CHECK-NEXT:          .cfi_negate_ra_state
   %1 = alloca i32, align 4
   %2 = alloca i32, align 4
   %3 = alloca i32, align 4
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-diff-scope-same-key.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-diff-scope-same-key.ll
index 6a11bef..a26dda1 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-diff-scope-same-key.ll
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-diff-scope-same-key.ll
@@ -5,9 +5,9 @@
 
 define void @a() "sign-return-address"="all" {
 ; CHECK-LABEL:      a:                                     // @a
-; V8A:              hint #25
-; V83A:             paciasp
-; CHECK-NEXT:      .cfi_negate_ra_state
+; CHECK:      .cfi_negate_ra_state
+; V8A-NEXT:              hint #25
+; V83A-NEXT:             paciasp
   %1 = alloca i32, align 4
   %2 = alloca i32, align 4
   %3 = alloca i32, align 4
@@ -52,9 +52,9 @@ define void @b() "sign-return-address"="non-leaf" {
 
 define void @c() "sign-return-address"="all" {
 ; CHECK-LABEL:         c:              // @c
-; V8A:                 hint #25
-; V83A:                paciasp
-; CHECK-NEXT          .cfi_negate_ra_state
+; CHECK:      .cfi_negate_ra_state
+; V8A-NEXT:              hint #25
+; V83A-NEXT:             paciasp
   %1 = alloca i32, align 4
   %2 = alloca i32, align 4
   %3 = alloca i32, align 4
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-non-leaf.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-non-leaf.ll
index 1e72246..064b2b7 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-non-leaf.ll
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-non-leaf.ll
@@ -8,8 +8,8 @@ define i64 @a(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key"
 ; V8A-LABEL: a:
 ; V8A:       // %bb.0:
 ; V8A-NEXT:    .cfi_b_key_frame
-; V8A-NEXT:    hint #27
 ; V8A-NEXT:    .cfi_negate_ra_state
+; V8A-NEXT:    hint #27
 ; V8A-NEXT:    sub sp, sp, #32
 ; V8A-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
 ; V8A-NEXT:    .cfi_def_cfa_offset 32
@@ -26,8 +26,8 @@ define i64 @a(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key"
 ; V83A-LABEL: a:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    .cfi_b_key_frame
-; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    sub sp, sp, #32
 ; V83A-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 32
@@ -59,8 +59,8 @@ define i64 @b(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key"
 ; V8A-LABEL: b:
 ; V8A:       // %bb.0:
 ; V8A-NEXT:    .cfi_b_key_frame
-; V8A-NEXT:    hint #27
 ; V8A-NEXT:    .cfi_negate_ra_state
+; V8A-NEXT:    hint #27
 ; V8A-NEXT:    sub sp, sp, #32
 ; V8A-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
 ; V8A-NEXT:    .cfi_def_cfa_offset 32
@@ -77,8 +77,8 @@ define i64 @b(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key"
 ; V83A-LABEL: b:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    .cfi_b_key_frame
-; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    sub sp, sp, #32
 ; V83A-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 32
@@ -110,8 +110,8 @@ define i64 @c(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key"
 ; V8A-LABEL: c:
 ; V8A:       // %bb.0:
 ; V8A-NEXT:    .cfi_b_key_frame
-; V8A-NEXT:    hint #27
 ; V8A-NEXT:    .cfi_negate_ra_state
+; V8A-NEXT:    hint #27
 ; V8A-NEXT:    sub sp, sp, #32
 ; V8A-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
 ; V8A-NEXT:    .cfi_def_cfa_offset 32
@@ -128,8 +128,8 @@ define i64 @c(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key"
 ; V83A-LABEL: c:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    .cfi_b_key_frame
-; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    sub sp, sp, #32
 ; V83A-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 32
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-regsave.mir b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-regsave.mir
index 9a983cb..218ee66 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-regsave.mir
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-regsave.mir
@@ -81,8 +81,8 @@ body:             |
 # CHECK:         name:            bar
 # CHECK:          bb.0:
 # CHECK:            frame-setup EMITBKEY
-# CHECK-NEXT:       frame-setup PACIBSP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK-NEXT:       frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NEXT:       frame-setup PACIBSP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK-NOT:        OUTLINED_FUNCTION_
 # CHECK:          bb.1:
 # CHECK-NOT:        OUTLINED_FUNCTION_
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-diff-key.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-diff-key.ll
index 87771f5..5c45373 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-diff-key.ll
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-diff-key.ll
@@ -7,8 +7,8 @@
 define void @a() "sign-return-address"="all" {
 ; V8A-LABEL: a:
 ; V8A:       // %bb.0:
-; V8A-NEXT:    hint #25
 ; V8A-NEXT:    .cfi_negate_ra_state
+; V8A-NEXT:    hint #25
 ; V8A-NEXT:    sub sp, sp, #32
 ; V8A-NEXT:    .cfi_def_cfa_offset 32
 ; V8A-NEXT:    mov w8, #1 // =0x1
@@ -26,8 +26,8 @@ define void @a() "sign-return-address"="all" {
 ;
 ; V83A-LABEL: a:
 ; V83A:       // %bb.0:
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    sub sp, sp, #32
 ; V83A-NEXT:    .cfi_def_cfa_offset 32
 ; V83A-NEXT:    mov w8, #1 // =0x1
@@ -60,8 +60,8 @@ define void @b() "sign-return-address"="all" "sign-return-address-key"="b_key" {
 ; V8A-LABEL: b:
 ; V8A:       // %bb.0:
 ; V8A-NEXT:    .cfi_b_key_frame
-; V8A-NEXT:    hint #27
 ; V8A-NEXT:    .cfi_negate_ra_state
+; V8A-NEXT:    hint #27
 ; V8A-NEXT:    sub sp, sp, #32
 ; V8A-NEXT:    .cfi_def_cfa_offset 32
 ; V8A-NEXT:    mov w8, #1 // =0x1
@@ -80,8 +80,8 @@ define void @b() "sign-return-address"="all" "sign-return-address-key"="b_key" {
 ; V83A-LABEL: b:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    .cfi_b_key_frame
-; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    sub sp, sp, #32
 ; V83A-NEXT:    .cfi_def_cfa_offset 32
 ; V83A-NEXT:    mov w8, #1 // =0x1
@@ -113,8 +113,8 @@ define void @b() "sign-return-address"="all" "sign-return-address-key"="b_key" {
 define void @c() "sign-return-address"="all" {
 ; V8A-LABEL: c:
 ; V8A:       // %bb.0:
-; V8A-NEXT:    hint #25
 ; V8A-NEXT:    .cfi_negate_ra_state
+; V8A-NEXT:    hint #25
 ; V8A-NEXT:    sub sp, sp, #32
 ; V8A-NEXT:    .cfi_def_cfa_offset 32
 ; V8A-NEXT:    mov w8, #1 // =0x1
@@ -132,8 +132,8 @@ define void @c() "sign-return-address"="all" {
 ;
 ; V83A-LABEL: c:
 ; V83A:       // %bb.0:
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    sub sp, sp, #32
 ; V83A-NEXT:    .cfi_def_cfa_offset 32
 ; V83A-NEXT:    mov w8, #1 // =0x1
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.mir b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.mir
index 22e5ede..d4a4b88 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.mir
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.mir
@@ -86,11 +86,11 @@ body:             |
 # CHECK:          body:             |
 # CHECK-NEXT:         bb.0 (%ir-block.0):
 # CHECK-NEXT:           liveins: $lr
-# CHECK:                frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
-# CHECK-NEXT:           frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK:                frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NEXT:           frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK:                BL @[[OUTLINED_FUNC:OUTLINED_FUNCTION_[0-9]+]]
-# CHECK:                frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
-# CHECK-NEXT:           frame-destroy CFI_INSTRUCTION negate_ra_sign_state
+# CHECK:                frame-destroy CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NEXT:           frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK-NEXT:           RET undef $lr
 
 ...
@@ -119,11 +119,11 @@ body:             |
 # CHECK:          body:             |
 # CHECK-NEXT:         bb.0 (%ir-block.0):
 # CHECK-NEXT:           liveins: $lr
-# CHECK:                frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
-# CHECK-NEXT:           frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK:                frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NEXT:           frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK:                BL @[[OUTLINED_FUNC]]
-# CHECK:                frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
-# CHECK-NEXT:           frame-destroy CFI_INSTRUCTION negate_ra_sign_state
+# CHECK:                frame-destroy CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NEXT:           frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK-NEXT:           RET undef $lr
 
 ...
@@ -174,22 +174,22 @@ body:             |
 # CHECK:          body:             |
 # CHECK-NEXT:         bb.0 (%ir-block.0):
 # CHECK-NEXT:           liveins: $lr
-# CHECK:                frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
-# CHECK-NEXT:           frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK:                frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NEXT:           frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK-NOT:            BL @OUTLINED_FUNCTION_{{.*}}
-# CHECK:                frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
-# CHECK-NEXT:           frame-destroy CFI_INSTRUCTION negate_ra_sign_state
+# CHECK:                frame-destroy CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NEXT:           frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK-NEXT:           RET undef $lr
 
 # CHECK-LABEL:    name:            illegal1
 # CHECK:          body:             |
 # CHECK-NEXT:         bb.0 (%ir-block.0):
 # CHECK-NEXT:           liveins: $lr
-# CHECK:                frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
-# CHECK-NEXT:           frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK:                frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NEXT:           frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK-NOT:            BL @OUTLINED_FUNCTION_{{.*}}
-# CHECK:                frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
-# CHECK-NEXT:           frame-destroy CFI_INSTRUCTION negate_ra_sign_state
+# CHECK:                frame-destroy CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NEXT:           frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK-NEXT:           RET undef $lr
 
 # Outlined function that contains only legal sp modifications
@@ -198,8 +198,8 @@ body:             |
 # CHECK-NEXT:       bb.0:
 # CHECK-NEXT: liveins: $lr
 # CHECK-NEXT: {{^  $}}
-# CHECK-NEXT:         frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK-NEXT:         frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NEXT:         frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK-NEXT:         $sp = frame-setup SUBXri $sp, 16, 0
 # CHECK:              $sp = frame-destroy ADDXri $sp, 16, 0
 # CHECK-NEXT:         frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-subtarget.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-subtarget.ll
index a7ea329..cb43b3b 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-subtarget.ll
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-subtarget.ll
@@ -9,8 +9,8 @@ define void @a() #0 {
 ; CHECK-LABEL:      a:                                     // @a
 ; CHECK:            // %bb.0:
 ; CHECK-NEXT:               .cfi_b_key_frame
-; CHECK-NEXT:               pacibsp
 ; CHECK-NEXT:               .cfi_negate_ra_state
+; CHECK-NEXT:               pacibsp
 ; CHECK-NOT:                OUTLINED_FUNCTION_
   %1 = alloca i32, align 4
   %2 = alloca i32, align 4
@@ -33,8 +33,8 @@ define void @b() #0 {
 ; CHECK-LABEL:      b:                                     // @b
 ; CHECK:            // %bb.0:
 ; CHECK-NEXT:               .cfi_b_key_frame
-; CHECK-NEXT:               pacibsp
 ; CHECK-NEXT:               .cfi_negate_ra_state
+; CHECK-NEXT:               pacibsp
 ; CHECK-NOT:                OUTLINED_FUNCTION_
   %1 = alloca i32, align 4
   %2 = alloca i32, align 4
@@ -57,8 +57,8 @@ define void @c() #1 {
 ; CHECK-LABEL:      c:                                     // @c
 ; CHECK:            // %bb.0:
 ; CHECK-NEXT:               .cfi_b_key_frame
-; CHECK-NEXT:               hint #27
 ; CHECK-NEXT:               .cfi_negate_ra_state
+; CHECK-NEXT:               hint #27
 ; CHECK-NOT:                OUTLINED_FUNCTION_
   %1 = alloca i32, align 4
   %2 = alloca i32, align 4
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-thunk.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-thunk.ll
index da68ea5..0ba4455 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-thunk.ll
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-thunk.ll
@@ -10,8 +10,8 @@ declare i32 @thunk_called_fn(i32, i32, i32, i32)
 define i32 @a() #0 {
 ; V8A-LABEL: a:
 ; V8A:       // %bb.0: // %entry
-; V8A-NEXT:    hint #25
 ; V8A-NEXT:    .cfi_negate_ra_state
+; V8A-NEXT:    hint #25
 ; V8A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V8A-NEXT:    .cfi_def_cfa_offset 16
 ; V8A-NEXT:    .cfi_offset w30, -16
@@ -27,8 +27,8 @@ define i32 @a() #0 {
 ;
 ; V83A-LABEL: a:
 ; V83A:       // %bb.0: // %entry
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -49,8 +49,8 @@ entry:
 define i32 @b() #0 {
 ; V8A-LABEL: b:
 ; V8A:       // %bb.0: // %entry
-; V8A-NEXT:    hint #25
 ; V8A-NEXT:    .cfi_negate_ra_state
+; V8A-NEXT:    hint #25
 ; V8A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V8A-NEXT:    .cfi_def_cfa_offset 16
 ; V8A-NEXT:    .cfi_offset w30, -16
@@ -66,8 +66,8 @@ define i32 @b() #0 {
 ;
 ; V83A-LABEL: b:
 ; V83A:       // %bb.0: // %entry
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -88,8 +88,8 @@ entry:
 define hidden i32 @c(ptr %fptr) #0 {
 ; V8A-LABEL: c:
 ; V8A:       // %bb.0: // %entry
-; V8A-NEXT:    hint #25
 ; V8A-NEXT:    .cfi_negate_ra_state
+; V8A-NEXT:    hint #25
 ; V8A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V8A-NEXT:    .cfi_def_cfa_offset 16
 ; V8A-NEXT:    .cfi_offset w30, -16
@@ -106,8 +106,8 @@ define hidden i32 @c(ptr %fptr) #0 {
 ;
 ; V83A-LABEL: c:
 ; V83A:       // %bb.0: // %entry
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -129,8 +129,8 @@ entry:
 define hidden i32 @d(ptr %fptr) #0 {
 ; V8A-LABEL: d:
 ; V8A:       // %bb.0: // %entry
-; V8A-NEXT:    hint #25
 ; V8A-NEXT:    .cfi_negate_ra_state
+; V8A-NEXT:    hint #25
 ; V8A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V8A-NEXT:    .cfi_def_cfa_offset 16
 ; V8A-NEXT:    .cfi_offset w30, -16
@@ -147,8 +147,8 @@ define hidden i32 @d(ptr %fptr) #0 {
 ;
 ; V83A-LABEL: d:
 ; V83A:       // %bb.0: // %entry
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -176,3 +176,5 @@ attributes #0 = { "sign-return-address"="non-leaf" minsize }
 ; CHECK-NOT:         .cfi_negate_ra_state
 ; CHECK-NOT:         auti{{[a,b]}}sp
 ; CHECK-NOT:         hint #{{[29,31]}}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/memcmp.ll b/llvm/test/CodeGen/AArch64/memcmp.ll
index 4da7c8c..98ea86b 100644
--- a/llvm/test/CodeGen/AArch64/memcmp.ll
+++ b/llvm/test/CodeGen/AArch64/memcmp.ll
@@ -162,8 +162,7 @@ define i32 @length3(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev w9, w9
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    cset w9, lo
-; CHECK-NEXT:    sub w0, w8, w9
+; CHECK-NEXT:    csinv w0, w8, wzr, hs
 ; CHECK-NEXT:    ret
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
   ret i32 %m
@@ -194,8 +193,7 @@ define i32 @length4(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev w9, w9
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    cset w9, lo
-; CHECK-NEXT:    sub w0, w8, w9
+; CHECK-NEXT:    csinv w0, w8, wzr, hs
 ; CHECK-NEXT:    ret
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
   ret i32 %m
@@ -259,6 +257,36 @@ define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
   ret i1 %c
 }
 
+define i1 @length4_le(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length4_le:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    ldr w9, [x1]
+; CHECK-NEXT:    rev w8, w8
+; CHECK-NEXT:    rev w9, w9
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    cset w0, ls
+; CHECK-NEXT:    ret
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp slt i32 %m, 1
+  ret i1 %c
+}
+
+define i1 @length4_ge(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length4_ge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    ldr w9, [x1]
+; CHECK-NEXT:    rev w8, w8
+; CHECK-NEXT:    rev w9, w9
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    cset w0, hs
+; CHECK-NEXT:    ret
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp sgt i32 %m, -1
+  ret i1 %c
+}
+
 define i1 @length4_eq_const(ptr %X) nounwind {
 ; CHECK-LABEL: length4_eq_const:
 ; CHECK:       // %bb.0:
@@ -286,8 +314,7 @@ define i32 @length5(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    cset w9, lo
-; CHECK-NEXT:    sub w0, w8, w9
+; CHECK-NEXT:    csinv w0, w8, wzr, hs
 ; CHECK-NEXT:    ret
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
   ret i32 %m
@@ -341,8 +368,7 @@ define i32 @length6(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    cset w9, lo
-; CHECK-NEXT:    sub w0, w8, w9
+; CHECK-NEXT:    csinv w0, w8, wzr, hs
 ; CHECK-NEXT:    ret
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 6) nounwind
   ret i32 %m
@@ -375,18 +401,18 @@ define i32 @length7(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev w8, w8
 ; CHECK-NEXT:    rev w9, w9
 ; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    b.ne .LBB24_3
+; CHECK-NEXT:    b.ne .LBB26_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldur w8, [x0, #3]
 ; CHECK-NEXT:    ldur w9, [x1, #3]
 ; CHECK-NEXT:    rev w8, w8
 ; CHECK-NEXT:    rev w9, w9
 ; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    b.ne .LBB24_3
+; CHECK-NEXT:    b.ne .LBB26_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB24_3: // %res_block
+; CHECK-NEXT:  .LBB26_3: // %res_block
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -403,18 +429,18 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev w8, w8
 ; CHECK-NEXT:    rev w9, w9
 ; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    b.ne .LBB25_3
+; CHECK-NEXT:    b.ne .LBB27_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldur w8, [x0, #3]
 ; CHECK-NEXT:    ldur w9, [x1, #3]
 ; CHECK-NEXT:    rev w8, w8
 ; CHECK-NEXT:    rev w9, w9
 ; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    b.ne .LBB25_3
+; CHECK-NEXT:    b.ne .LBB27_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    lsr w0, wzr, #31
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB25_3: // %res_block
+; CHECK-NEXT:  .LBB27_3: // %res_block
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
@@ -450,8 +476,7 @@ define i32 @length8(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    cset w9, lo
-; CHECK-NEXT:    sub w0, w8, w9
+; CHECK-NEXT:    csinv w0, w8, wzr, hs
 ; CHECK-NEXT:    ret
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
   ret i32 %m
@@ -494,13 +519,13 @@ define i32 @length9(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB30_2
+; CHECK-NEXT:    b.ne .LBB32_2
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldrb w8, [x0, #8]
 ; CHECK-NEXT:    ldrb w9, [x1, #8]
 ; CHECK-NEXT:    sub w0, w8, w9
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB30_2: // %res_block
+; CHECK-NEXT:  .LBB32_2: // %res_block
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
 ; CHECK-NEXT:    ret
@@ -532,7 +557,7 @@ define i32 @length10(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB32_3
+; CHECK-NEXT:    b.ne .LBB34_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldrh w8, [x0, #8]
 ; CHECK-NEXT:    ldrh w9, [x1, #8]
@@ -541,11 +566,11 @@ define i32 @length10(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    lsr w8, w8, #16
 ; CHECK-NEXT:    lsr w9, w9, #16
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB32_3
+; CHECK-NEXT:    b.ne .LBB34_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB32_3: // %res_block
+; CHECK-NEXT:  .LBB34_3: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -578,18 +603,18 @@ define i32 @length11(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB34_3
+; CHECK-NEXT:    b.ne .LBB36_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldur x8, [x0, #3]
 ; CHECK-NEXT:    ldur x9, [x1, #3]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB34_3
+; CHECK-NEXT:    b.ne .LBB36_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB34_3: // %res_block
+; CHECK-NEXT:  .LBB36_3: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -638,18 +663,18 @@ define i32 @length12(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB37_3
+; CHECK-NEXT:    b.ne .LBB39_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr w8, [x0, #8]
 ; CHECK-NEXT:    ldr w9, [x1, #8]
 ; CHECK-NEXT:    rev w8, w8
 ; CHECK-NEXT:    rev w9, w9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB37_3
+; CHECK-NEXT:    b.ne .LBB39_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB37_3: // %res_block
+; CHECK-NEXT:  .LBB39_3: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -698,18 +723,18 @@ define i32 @length15(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB40_3
+; CHECK-NEXT:    b.ne .LBB42_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldur x8, [x0, #7]
 ; CHECK-NEXT:    ldur x9, [x1, #7]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB40_3
+; CHECK-NEXT:    b.ne .LBB42_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB40_3: // %res_block
+; CHECK-NEXT:  .LBB42_3: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -726,18 +751,18 @@ define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB41_3
+; CHECK-NEXT:    b.ne .LBB43_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldur x8, [x0, #7]
 ; CHECK-NEXT:    ldur x9, [x1, #7]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB41_3
+; CHECK-NEXT:    b.ne .LBB43_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    lsr w0, wzr, #31
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB41_3: // %res_block
+; CHECK-NEXT:  .LBB43_3: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
@@ -758,7 +783,7 @@ define i32 @length15_const(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    movk x8, #12594, lsl #48
 ; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    b.ne .LBB42_3
+; CHECK-NEXT:    b.ne .LBB44_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    mov x8, #13365 // =0x3435
 ; CHECK-NEXT:    ldur x9, [x0, #7]
@@ -767,11 +792,11 @@ define i32 @length15_const(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    movk x8, #14393, lsl #48
 ; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    b.ne .LBB42_3
+; CHECK-NEXT:    b.ne .LBB44_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB42_3: // %res_block
+; CHECK-NEXT:  .LBB44_3: // %res_block
 ; CHECK-NEXT:    cmp x9, x8
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -806,7 +831,7 @@ define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    movk x8, #12594, lsl #48
 ; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    b.ne .LBB44_3
+; CHECK-NEXT:    b.ne .LBB46_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    mov x8, #13365 // =0x3435
 ; CHECK-NEXT:    ldur x9, [x0, #7]
@@ -815,15 +840,15 @@ define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    movk x8, #14393, lsl #48
 ; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    b.ne .LBB44_3
+; CHECK-NEXT:    b.ne .LBB46_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB44_4
-; CHECK-NEXT:  .LBB44_3: // %res_block
+; CHECK-NEXT:    b .LBB46_4
+; CHECK-NEXT:  .LBB46_3: // %res_block
 ; CHECK-NEXT:    cmp x9, x8
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB44_4: // %endblock
+; CHECK-NEXT:  .LBB46_4: // %endblock
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
@@ -841,18 +866,18 @@ define i32 @length16(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB45_3
+; CHECK-NEXT:    b.ne .LBB47_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB45_3
+; CHECK-NEXT:    b.ne .LBB47_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB45_3: // %res_block
+; CHECK-NEXT:  .LBB47_3: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -883,18 +908,18 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB47_3
+; CHECK-NEXT:    b.ne .LBB49_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB47_3
+; CHECK-NEXT:    b.ne .LBB49_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    lsr w0, wzr, #31
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB47_3: // %res_block
+; CHECK-NEXT:  .LBB49_3: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
@@ -913,22 +938,22 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB48_3
+; CHECK-NEXT:    b.ne .LBB50_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB48_3
+; CHECK-NEXT:    b.ne .LBB50_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB48_4
-; CHECK-NEXT:  .LBB48_3: // %res_block
+; CHECK-NEXT:    b .LBB50_4
+; CHECK-NEXT:  .LBB50_3: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB48_4: // %endblock
+; CHECK-NEXT:  .LBB50_4: // %endblock
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
@@ -967,25 +992,25 @@ define i32 @length24(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB50_4
+; CHECK-NEXT:    b.ne .LBB52_4
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB50_4
+; CHECK-NEXT:    b.ne .LBB52_4
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB50_4
+; CHECK-NEXT:    b.ne .LBB52_4
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB50_4: // %res_block
+; CHECK-NEXT:  .LBB52_4: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -1019,25 +1044,25 @@ define i1 @length24_lt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB52_4
+; CHECK-NEXT:    b.ne .LBB54_4
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB52_4
+; CHECK-NEXT:    b.ne .LBB54_4
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB52_4
+; CHECK-NEXT:    b.ne .LBB54_4
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    lsr w0, wzr, #31
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB52_4: // %res_block
+; CHECK-NEXT:  .LBB54_4: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
@@ -1056,29 +1081,29 @@ define i1 @length24_gt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB53_4
+; CHECK-NEXT:    b.ne .LBB55_4
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB53_4
+; CHECK-NEXT:    b.ne .LBB55_4
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB53_4
+; CHECK-NEXT:    b.ne .LBB55_4
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB53_5
-; CHECK-NEXT:  .LBB53_4: // %res_block
+; CHECK-NEXT:    b .LBB55_5
+; CHECK-NEXT:  .LBB55_4: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB53_5: // %endblock
+; CHECK-NEXT:  .LBB55_5: // %endblock
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
@@ -1122,32 +1147,32 @@ define i32 @length31(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB55_5
+; CHECK-NEXT:    b.ne .LBB57_5
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB55_5
+; CHECK-NEXT:    b.ne .LBB57_5
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB55_5
+; CHECK-NEXT:    b.ne .LBB57_5
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldur x8, [x0, #23]
 ; CHECK-NEXT:    ldur x9, [x1, #23]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB55_5
+; CHECK-NEXT:    b.ne .LBB57_5
 ; CHECK-NEXT:  // %bb.4:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB55_5: // %res_block
+; CHECK-NEXT:  .LBB57_5: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -1184,32 +1209,32 @@ define i1 @length31_lt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB57_5
+; CHECK-NEXT:    b.ne .LBB59_5
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB57_5
+; CHECK-NEXT:    b.ne .LBB59_5
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB57_5
+; CHECK-NEXT:    b.ne .LBB59_5
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldur x8, [x0, #23]
 ; CHECK-NEXT:    ldur x9, [x1, #23]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB57_5
+; CHECK-NEXT:    b.ne .LBB59_5
 ; CHECK-NEXT:  // %bb.4:
 ; CHECK-NEXT:    lsr w0, wzr, #31
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB57_5: // %res_block
+; CHECK-NEXT:  .LBB59_5: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
@@ -1228,36 +1253,36 @@ define i1 @length31_gt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB58_5
+; CHECK-NEXT:    b.ne .LBB60_5
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB58_5
+; CHECK-NEXT:    b.ne .LBB60_5
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB58_5
+; CHECK-NEXT:    b.ne .LBB60_5
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldur x8, [x0, #23]
 ; CHECK-NEXT:    ldur x9, [x1, #23]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB58_5
+; CHECK-NEXT:    b.ne .LBB60_5
 ; CHECK-NEXT:  // %bb.4:
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB58_6
-; CHECK-NEXT:  .LBB58_5: // %res_block
+; CHECK-NEXT:    b .LBB60_6
+; CHECK-NEXT:  .LBB60_5: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB58_6: // %endblock
+; CHECK-NEXT:  .LBB60_6: // %endblock
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
@@ -1327,32 +1352,32 @@ define i32 @length32(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB61_5
+; CHECK-NEXT:    b.ne .LBB63_5
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB61_5
+; CHECK-NEXT:    b.ne .LBB63_5
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB61_5
+; CHECK-NEXT:    b.ne .LBB63_5
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB61_5
+; CHECK-NEXT:    b.ne .LBB63_5
 ; CHECK-NEXT:  // %bb.4:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB61_5: // %res_block
+; CHECK-NEXT:  .LBB63_5: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -1388,32 +1413,32 @@ define i1 @length32_lt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB63_5
+; CHECK-NEXT:    b.ne .LBB65_5
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB63_5
+; CHECK-NEXT:    b.ne .LBB65_5
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB63_5
+; CHECK-NEXT:    b.ne .LBB65_5
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB63_5
+; CHECK-NEXT:    b.ne .LBB65_5
 ; CHECK-NEXT:  // %bb.4:
 ; CHECK-NEXT:    lsr w0, wzr, #31
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB63_5: // %res_block
+; CHECK-NEXT:  .LBB65_5: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
@@ -1432,36 +1457,36 @@ define i1 @length32_gt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB64_5
+; CHECK-NEXT:    b.ne .LBB66_5
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB64_5
+; CHECK-NEXT:    b.ne .LBB66_5
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB64_5
+; CHECK-NEXT:    b.ne .LBB66_5
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB64_5
+; CHECK-NEXT:    b.ne .LBB66_5
 ; CHECK-NEXT:  // %bb.4:
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB64_6
-; CHECK-NEXT:  .LBB64_5: // %res_block
+; CHECK-NEXT:    b .LBB66_6
+; CHECK-NEXT:  .LBB66_5: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB64_6: // %endblock
+; CHECK-NEXT:  .LBB66_6: // %endblock
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
@@ -1528,46 +1553,46 @@ define i32 @length48(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
+; CHECK-NEXT:    b.ne .LBB69_7
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
+; CHECK-NEXT:    b.ne .LBB69_7
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
+; CHECK-NEXT:    b.ne .LBB69_7
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
+; CHECK-NEXT:    b.ne .LBB69_7
 ; CHECK-NEXT:  // %bb.4: // %loadbb4
 ; CHECK-NEXT:    ldr x8, [x0, #32]
 ; CHECK-NEXT:    ldr x9, [x1, #32]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
+; CHECK-NEXT:    b.ne .LBB69_7
 ; CHECK-NEXT:  // %bb.5: // %loadbb5
 ; CHECK-NEXT:    ldr x8, [x0, #40]
 ; CHECK-NEXT:    ldr x9, [x1, #40]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
+; CHECK-NEXT:    b.ne .LBB69_7
 ; CHECK-NEXT:  // %bb.6:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB67_7: // %res_block
+; CHECK-NEXT:  .LBB69_7: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -1606,46 +1631,46 @@ define i1 @length48_lt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
+; CHECK-NEXT:    b.ne .LBB71_7
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
+; CHECK-NEXT:    b.ne .LBB71_7
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
+; CHECK-NEXT:    b.ne .LBB71_7
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
+; CHECK-NEXT:    b.ne .LBB71_7
 ; CHECK-NEXT:  // %bb.4: // %loadbb4
 ; CHECK-NEXT:    ldr x8, [x0, #32]
 ; CHECK-NEXT:    ldr x9, [x1, #32]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
+; CHECK-NEXT:    b.ne .LBB71_7
 ; CHECK-NEXT:  // %bb.5: // %loadbb5
 ; CHECK-NEXT:    ldr x8, [x0, #40]
 ; CHECK-NEXT:    ldr x9, [x1, #40]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
+; CHECK-NEXT:    b.ne .LBB71_7
 ; CHECK-NEXT:  // %bb.6:
 ; CHECK-NEXT:    lsr w0, wzr, #31
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB69_7: // %res_block
+; CHECK-NEXT:  .LBB71_7: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
@@ -1664,50 +1689,50 @@ define i1 @length48_gt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
+; CHECK-NEXT:    b.ne .LBB72_7
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
+; CHECK-NEXT:    b.ne .LBB72_7
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
+; CHECK-NEXT:    b.ne .LBB72_7
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
+; CHECK-NEXT:    b.ne .LBB72_7
 ; CHECK-NEXT:  // %bb.4: // %loadbb4
 ; CHECK-NEXT:    ldr x8, [x0, #32]
 ; CHECK-NEXT:    ldr x9, [x1, #32]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
+; CHECK-NEXT:    b.ne .LBB72_7
 ; CHECK-NEXT:  // %bb.5: // %loadbb5
 ; CHECK-NEXT:    ldr x8, [x0, #40]
 ; CHECK-NEXT:    ldr x9, [x1, #40]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
+; CHECK-NEXT:    b.ne .LBB72_7
 ; CHECK-NEXT:  // %bb.6:
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB70_8
-; CHECK-NEXT:  .LBB70_7: // %res_block
+; CHECK-NEXT:    b .LBB72_8
+; CHECK-NEXT:  .LBB72_7: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB70_8: // %endblock
+; CHECK-NEXT:  .LBB72_8: // %endblock
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
@@ -1785,60 +1810,60 @@ define i32 @length63(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
+; CHECK-NEXT:    b.ne .LBB75_9
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
+; CHECK-NEXT:    b.ne .LBB75_9
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
+; CHECK-NEXT:    b.ne .LBB75_9
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
+; CHECK-NEXT:    b.ne .LBB75_9
 ; CHECK-NEXT:  // %bb.4: // %loadbb4
 ; CHECK-NEXT:    ldr x8, [x0, #32]
 ; CHECK-NEXT:    ldr x9, [x1, #32]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
+; CHECK-NEXT:    b.ne .LBB75_9
 ; CHECK-NEXT:  // %bb.5: // %loadbb5
 ; CHECK-NEXT:    ldr x8, [x0, #40]
 ; CHECK-NEXT:    ldr x9, [x1, #40]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
+; CHECK-NEXT:    b.ne .LBB75_9
 ; CHECK-NEXT:  // %bb.6: // %loadbb6
 ; CHECK-NEXT:    ldr x8, [x0, #48]
 ; CHECK-NEXT:    ldr x9, [x1, #48]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
+; CHECK-NEXT:    b.ne .LBB75_9
 ; CHECK-NEXT:  // %bb.7: // %loadbb7
 ; CHECK-NEXT:    ldur x8, [x0, #55]
 ; CHECK-NEXT:    ldur x9, [x1, #55]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
+; CHECK-NEXT:    b.ne .LBB75_9
 ; CHECK-NEXT:  // %bb.8:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB73_9: // %res_block
+; CHECK-NEXT:  .LBB75_9: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -1883,60 +1908,60 @@ define i1 @length63_lt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
+; CHECK-NEXT:    b.ne .LBB77_9
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
+; CHECK-NEXT:    b.ne .LBB77_9
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
+; CHECK-NEXT:    b.ne .LBB77_9
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
+; CHECK-NEXT:    b.ne .LBB77_9
 ; CHECK-NEXT:  // %bb.4: // %loadbb4
 ; CHECK-NEXT:    ldr x8, [x0, #32]
 ; CHECK-NEXT:    ldr x9, [x1, #32]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
+; CHECK-NEXT:    b.ne .LBB77_9
 ; CHECK-NEXT:  // %bb.5: // %loadbb5
 ; CHECK-NEXT:    ldr x8, [x0, #40]
 ; CHECK-NEXT:    ldr x9, [x1, #40]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
+; CHECK-NEXT:    b.ne .LBB77_9
 ; CHECK-NEXT:  // %bb.6: // %loadbb6
 ; CHECK-NEXT:    ldr x8, [x0, #48]
 ; CHECK-NEXT:    ldr x9, [x1, #48]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
+; CHECK-NEXT:    b.ne .LBB77_9
 ; CHECK-NEXT:  // %bb.7: // %loadbb7
 ; CHECK-NEXT:    ldur x8, [x0, #55]
 ; CHECK-NEXT:    ldur x9, [x1, #55]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
+; CHECK-NEXT:    b.ne .LBB77_9
 ; CHECK-NEXT:  // %bb.8:
 ; CHECK-NEXT:    lsr w0, wzr, #31
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB75_9: // %res_block
+; CHECK-NEXT:  .LBB77_9: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
@@ -1955,64 +1980,64 @@ define i1 @length63_gt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
+; CHECK-NEXT:    b.ne .LBB78_9
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
+; CHECK-NEXT:    b.ne .LBB78_9
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
+; CHECK-NEXT:    b.ne .LBB78_9
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
+; CHECK-NEXT:    b.ne .LBB78_9
 ; CHECK-NEXT:  // %bb.4: // %loadbb4
 ; CHECK-NEXT:    ldr x8, [x0, #32]
 ; CHECK-NEXT:    ldr x9, [x1, #32]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
+; CHECK-NEXT:    b.ne .LBB78_9
 ; CHECK-NEXT:  // %bb.5: // %loadbb5
 ; CHECK-NEXT:    ldr x8, [x0, #40]
 ; CHECK-NEXT:    ldr x9, [x1, #40]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
+; CHECK-NEXT:    b.ne .LBB78_9
 ; CHECK-NEXT:  // %bb.6: // %loadbb6
 ; CHECK-NEXT:    ldr x8, [x0, #48]
 ; CHECK-NEXT:    ldr x9, [x1, #48]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
+; CHECK-NEXT:    b.ne .LBB78_9
 ; CHECK-NEXT:  // %bb.7: // %loadbb7
 ; CHECK-NEXT:    ldur x8, [x0, #55]
 ; CHECK-NEXT:    ldur x9, [x1, #55]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
+; CHECK-NEXT:    b.ne .LBB78_9
 ; CHECK-NEXT:  // %bb.8:
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB76_10
-; CHECK-NEXT:  .LBB76_9: // %res_block
+; CHECK-NEXT:    b .LBB78_10
+; CHECK-NEXT:  .LBB78_9: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB76_10: // %endblock
+; CHECK-NEXT:  .LBB78_10: // %endblock
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
@@ -2076,60 +2101,60 @@ define i32 @length64(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
+; CHECK-NEXT:    b.ne .LBB80_9
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
+; CHECK-NEXT:    b.ne .LBB80_9
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
+; CHECK-NEXT:    b.ne .LBB80_9
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
+; CHECK-NEXT:    b.ne .LBB80_9
 ; CHECK-NEXT:  // %bb.4: // %loadbb4
 ; CHECK-NEXT:    ldr x8, [x0, #32]
 ; CHECK-NEXT:    ldr x9, [x1, #32]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
+; CHECK-NEXT:    b.ne .LBB80_9
 ; CHECK-NEXT:  // %bb.5: // %loadbb5
 ; CHECK-NEXT:    ldr x8, [x0, #40]
 ; CHECK-NEXT:    ldr x9, [x1, #40]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
+; CHECK-NEXT:    b.ne .LBB80_9
 ; CHECK-NEXT:  // %bb.6: // %loadbb6
 ; CHECK-NEXT:    ldr x8, [x0, #48]
 ; CHECK-NEXT:    ldr x9, [x1, #48]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
+; CHECK-NEXT:    b.ne .LBB80_9
 ; CHECK-NEXT:  // %bb.7: // %loadbb7
 ; CHECK-NEXT:    ldr x8, [x0, #56]
 ; CHECK-NEXT:    ldr x9, [x1, #56]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
+; CHECK-NEXT:    b.ne .LBB80_9
 ; CHECK-NEXT:  // %bb.8:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB78_9: // %res_block
+; CHECK-NEXT:  .LBB80_9: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -2172,60 +2197,60 @@ define i1 @length64_lt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
+; CHECK-NEXT:    b.ne .LBB82_9
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
+; CHECK-NEXT:    b.ne .LBB82_9
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
+; CHECK-NEXT:    b.ne .LBB82_9
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
+; CHECK-NEXT:    b.ne .LBB82_9
 ; CHECK-NEXT:  // %bb.4: // %loadbb4
 ; CHECK-NEXT:    ldr x8, [x0, #32]
 ; CHECK-NEXT:    ldr x9, [x1, #32]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
+; CHECK-NEXT:    b.ne .LBB82_9
 ; CHECK-NEXT:  // %bb.5: // %loadbb5
 ; CHECK-NEXT:    ldr x8, [x0, #40]
 ; CHECK-NEXT:    ldr x9, [x1, #40]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
+; CHECK-NEXT:    b.ne .LBB82_9
 ; CHECK-NEXT:  // %bb.6: // %loadbb6
 ; CHECK-NEXT:    ldr x8, [x0, #48]
 ; CHECK-NEXT:    ldr x9, [x1, #48]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
+; CHECK-NEXT:    b.ne .LBB82_9
 ; CHECK-NEXT:  // %bb.7: // %loadbb7
 ; CHECK-NEXT:    ldr x8, [x0, #56]
 ; CHECK-NEXT:    ldr x9, [x1, #56]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
+; CHECK-NEXT:    b.ne .LBB82_9
 ; CHECK-NEXT:  // %bb.8:
 ; CHECK-NEXT:    lsr w0, wzr, #31
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB80_9: // %res_block
+; CHECK-NEXT:  .LBB82_9: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
@@ -2244,64 +2269,64 @@ define i1 @length64_gt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
+; CHECK-NEXT:    b.ne .LBB83_9
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
+; CHECK-NEXT:    b.ne .LBB83_9
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
+; CHECK-NEXT:    b.ne .LBB83_9
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
+; CHECK-NEXT:    b.ne .LBB83_9
 ; CHECK-NEXT:  // %bb.4: // %loadbb4
 ; CHECK-NEXT:    ldr x8, [x0, #32]
 ; CHECK-NEXT:    ldr x9, [x1, #32]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
+; CHECK-NEXT:    b.ne .LBB83_9
 ; CHECK-NEXT:  // %bb.5: // %loadbb5
 ; CHECK-NEXT:    ldr x8, [x0, #40]
 ; CHECK-NEXT:    ldr x9, [x1, #40]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
+; CHECK-NEXT:    b.ne .LBB83_9
 ; CHECK-NEXT:  // %bb.6: // %loadbb6
 ; CHECK-NEXT:    ldr x8, [x0, #48]
 ; CHECK-NEXT:    ldr x9, [x1, #48]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
+; CHECK-NEXT:    b.ne .LBB83_9
 ; CHECK-NEXT:  // %bb.7: // %loadbb7
 ; CHECK-NEXT:    ldr x8, [x0, #56]
 ; CHECK-NEXT:    ldr x9, [x1, #56]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
+; CHECK-NEXT:    b.ne .LBB83_9
 ; CHECK-NEXT:  // %bb.8:
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB81_10
-; CHECK-NEXT:  .LBB81_9: // %res_block
+; CHECK-NEXT:    b .LBB83_10
+; CHECK-NEXT:  .LBB83_9: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB81_10: // %endblock
+; CHECK-NEXT:  .LBB83_10: // %endblock
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
index c1b9a4c..9ece9ed 100644
--- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
@@ -367,6 +367,166 @@ entry:
   ret <4 x i64> %partial.reduce
 }
 
+define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
+; CHECK-DOT-LABEL: udot_no_bin_op:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v2.16b, #1
+; CHECK-DOT-NEXT:    udot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-NODOT-LABEL: udot_no_bin_op:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    ushll v2.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-NODOT-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v2.4h
+; CHECK-NODOT-NEXT:    uaddw2 v2.4s, v3.4s, v2.8h
+; CHECK-NODOT-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
+; CHECK-NODOT-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-NODOT-NEXT:    ret
+  %a.wide = zext <16 x i8> %a to <16 x i32>
+  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
+  ret <4 x i32> %partial.reduce
+}
+
+define <4 x i32> @sdot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
+; CHECK-DOT-LABEL: sdot_no_bin_op:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v2.16b, #1
+; CHECK-DOT-NEXT:    sdot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-NODOT-LABEL: sdot_no_bin_op:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    sshll v2.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-NODOT-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v2.4h
+; CHECK-NODOT-NEXT:    saddw2 v2.4s, v3.4s, v2.8h
+; CHECK-NODOT-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
+; CHECK-NODOT-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-NODOT-NEXT:    ret
+  %a.wide = sext <16 x i8> %a to <16 x i32>
+  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
+  ret <4 x i32> %partial.reduce
+}
+
+define <2 x i32> @udot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
+; CHECK-DOT-LABEL: udot_no_bin_op_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v2.8b, #1
+; CHECK-DOT-NEXT:    udot v0.2s, v1.8b, v2.8b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-NODOT-LABEL: udot_no_bin_op_narrow:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NODOT-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-NODOT-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
+; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
+; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-NODOT-NEXT:    uaddw v1.4s, v2.4s, v4.4h
+; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT:    ret
+  %a.wide = zext <8 x i8> %a to <8 x i32>
+  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide)
+  ret <2 x i32> %partial.reduce
+}
+
+define <2 x i32> @sdot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
+; CHECK-DOT-LABEL: sdot_no_bin_op_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v2.8b, #1
+; CHECK-DOT-NEXT:    sdot v0.2s, v1.8b, v2.8b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-NODOT-LABEL: sdot_no_bin_op_narrow:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NODOT-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-NODOT-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
+; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
+; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-NODOT-NEXT:    saddw v1.4s, v2.4s, v4.4h
+; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT:    ret
+  %a.wide = sext <8 x i8> %a to <8 x i32>
+  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide)
+  ret <2 x i32> %partial.reduce
+}
+
+define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
+; CHECK-DOT-LABEL: udot_no_bin_op_8to64:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v3.16b, #1
+; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-NEXT:    udot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
+; CHECK-DOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-NODOT-LABEL: udot_no_bin_op_8to64:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    ushll v3.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    ushll v4.4s, v3.4h, #0
+; CHECK-NODOT-NEXT:    ushll v5.4s, v2.4h, #0
+; CHECK-NODOT-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-NODOT-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-NODOT-NEXT:    uaddw2 v1.2d, v1.2d, v4.4s
+; CHECK-NODOT-NEXT:    uaddw v0.2d, v0.2d, v4.2s
+; CHECK-NODOT-NEXT:    uaddl2 v4.2d, v3.4s, v5.4s
+; CHECK-NODOT-NEXT:    uaddl v3.2d, v3.2s, v5.2s
+; CHECK-NODOT-NEXT:    uaddw2 v1.2d, v1.2d, v2.4s
+; CHECK-NODOT-NEXT:    uaddw v0.2d, v0.2d, v2.2s
+; CHECK-NODOT-NEXT:    add v1.2d, v4.2d, v1.2d
+; CHECK-NODOT-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-NODOT-NEXT:    ret
+  %a.wide = zext <16 x i8> %a to <16 x i64>
+  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
+  ret <4 x i64> %partial.reduce
+}
+
+define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
+; CHECK-DOT-LABEL: sdot_no_bin_op_8to64:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v3.16b, #1
+; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-NEXT:    sdot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
+; CHECK-DOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-NODOT-LABEL: sdot_no_bin_op_8to64:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    sshll v3.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    sshll v4.4s, v3.4h, #0
+; CHECK-NODOT-NEXT:    sshll v5.4s, v2.4h, #0
+; CHECK-NODOT-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-NODOT-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
+; CHECK-NODOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-NODOT-NEXT:    saddl2 v4.2d, v3.4s, v5.4s
+; CHECK-NODOT-NEXT:    saddl v3.2d, v3.2s, v5.2s
+; CHECK-NODOT-NEXT:    saddw2 v1.2d, v1.2d, v2.4s
+; CHECK-NODOT-NEXT:    saddw v0.2d, v0.2d, v2.2s
+; CHECK-NODOT-NEXT:    add v1.2d, v4.2d, v1.2d
+; CHECK-NODOT-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-NODOT-NEXT:    ret
+  %a.wide = sext <16 x i8> %a to <16 x i64>
+  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
+  ret <4 x i64> %partial.reduce
+}
+
 define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
 ; CHECK-LABEL: not_udot:
 ; CHECK:       // %bb.0:
@@ -398,3 +558,91 @@ define <2 x i32> @not_udot_narrow(<2 x i32> %acc, <4 x i8> %u, <4 x i8> %s) {
   %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<2 x i32> %acc, <4 x i32> %mult)
   ret <2 x i32> %partial.reduce
 }
+
+define <2 x i64> @udot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
+; CHECK-LABEL: udot_different_types:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-NEXT:    umull v5.2d, v1.2s, v2.2s
+; CHECK-NEXT:    umlal v0.2d, v3.2s, v4.2s
+; CHECK-NEXT:    umlal2 v0.2d, v1.4s, v2.4s
+; CHECK-NEXT:    umlal2 v5.2d, v3.4s, v4.4s
+; CHECK-NEXT:    add v0.2d, v5.2d, v0.2d
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = zext <8 x i16> %a to <8 x i64>
+  %b.wide = zext <8 x i8> %b to <8 x i64>
+  %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide
+  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult)
+  ret <2 x i64> %partial.reduce
+}
+
+define <2 x i64> @sdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
+; CHECK-LABEL: sdot_different_types:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-NEXT:    smull v5.2d, v1.2s, v2.2s
+; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
+; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
+; CHECK-NEXT:    smlal2 v5.2d, v3.4s, v4.4s
+; CHECK-NEXT:    add v0.2d, v5.2d, v0.2d
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = sext <8 x i16> %a to <8 x i64>
+  %b.wide = sext <8 x i8> %b to <8 x i64>
+  %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide
+  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult)
+  ret <2 x i64> %partial.reduce
+}
+
+define <2 x i64> @usdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
+; CHECK-LABEL: usdot_different_types:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-NEXT:    smull v5.2d, v1.2s, v2.2s
+; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
+; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
+; CHECK-NEXT:    smlal2 v5.2d, v3.4s, v4.4s
+; CHECK-NEXT:    add v0.2d, v5.2d, v0.2d
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = zext <8 x i16> %a to <8 x i64>
+  %b.wide = sext <8 x i8> %b to <8 x i64>
+  %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide
+  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult)
+  ret <2 x i64> %partial.reduce
+}
+
+define <2 x i64> @sudot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
+; CHECK-LABEL: sudot_different_types:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-NEXT:    smull v5.2d, v1.2s, v2.2s
+; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
+; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
+; CHECK-NEXT:    smlal2 v5.2d, v3.4s, v4.4s
+; CHECK-NEXT:    add v0.2d, v5.2d, v0.2d
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = sext <8 x i16> %a to <8 x i64>
+  %b.wide = zext <8 x i8> %b to <8 x i64>
+  %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide
+  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult)
+  ret <2 x i64> %partial.reduce
+}
diff --git a/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll b/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll
index abdfb99..db5b932 100644
--- a/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll
@@ -23,19 +23,11 @@ entry:
 }
 
 define <4 x i32> @v4i32(<4 x i32> %a) {
-; CHECK-SD-LABEL: v4i32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    rev64 v0.4s, v0.4s
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v4i32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI2_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI2_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ret
 entry:
   %V128 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   ret <4 x i32> %V128
@@ -52,19 +44,11 @@ entry:
 }
 
 define <8 x i16> @v8i16(<8 x i16> %a) {
-; CHECK-SD-LABEL: v8i16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    rev64 v0.8h, v0.8h
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v8i16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI4_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI4_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ret
 entry:
   %V128 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x i16> %V128
@@ -93,6 +77,22 @@ entry:
   ret <8 x i16> %V128
 }
 
+define <4 x i16> @v8i16_3(<8 x i16> %a) {
+; CHECK-SD-LABEL: v8i16_3:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev64 v0.4h, v0.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v8i16_3:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %V128 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i16> %V128
+}
+
 define <4 x i16> @v4i16(<4 x i16> %a) {
 ; CHECK-LABEL: v4i16:
 ; CHECK:       // %bb.0: // %entry
@@ -104,19 +104,11 @@ entry:
 }
 
 define <16 x i8> @v16i8(<16 x i8> %a) {
-; CHECK-SD-LABEL: v16i8:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    rev64 v0.16b, v0.16b
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v16i8:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI7_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI7_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev64 v0.16b, v0.16b
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ret
 entry:
   %V128 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <16 x i8> %V128
@@ -125,18 +117,18 @@ entry:
 define <16 x i8> @v16i8_2(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-SD-LABEL: v16i8_2:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    adrp x8, .LCPI8_0
+; CHECK-SD-NEXT:    adrp x8, .LCPI9_0
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
-; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI8_0]
+; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI9_0]
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
 ; CHECK-SD-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: v16i8_2:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI8_0
+; CHECK-GI-NEXT:    adrp x8, .LCPI9_0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI8_0]
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI9_0]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
 ; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-GI-NEXT:    ret
@@ -166,19 +158,11 @@ entry:
 }
 
 define <4 x float> @v4f32(<4 x float> %a) {
-; CHECK-SD-LABEL: v4f32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    rev64 v0.4s, v0.4s
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v4f32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI11_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI11_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v4f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ret
 entry:
   %V128 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   ret <4 x float> %V128
@@ -195,19 +179,11 @@ entry:
 }
 
 define <8 x half> @v8f16(<8 x half> %a) {
-; CHECK-SD-LABEL: v8f16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    rev64 v0.8h, v0.8h
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v8f16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI13_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI13_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v8f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ret
 entry:
   %V128 = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x half> %V128
diff --git a/llvm/test/CodeGen/AArch64/pacbti-llvm-generated-funcs-2.ll b/llvm/test/CodeGen/AArch64/pacbti-llvm-generated-funcs-2.ll
index 373c496..f823d2a 100644
--- a/llvm/test/CodeGen/AArch64/pacbti-llvm-generated-funcs-2.ll
+++ b/llvm/test/CodeGen/AArch64/pacbti-llvm-generated-funcs-2.ll
@@ -34,8 +34,8 @@ entry:
 }
 ;; CHECK-LABEL: __llvm_gcov_writeout:
 ;; CHECK:       .cfi_b_key_frame
-;; CHECK-NEXT:  pacibsp
 ;; CHECK-NEXT:  .cfi_negate_ra_state
+;; CHECK-NEXT:  pacibsp
 
 define internal void @__llvm_gcov_reset() unnamed_addr #2 {
 entry:
@@ -54,9 +54,9 @@ entry:
 }
 ;; CHECK-LABEL: __llvm_gcov_init:
 ;; CHECK:      .cfi_b_key_frame
-;; CHECK-NEXT:  pacibsp
 ;; CHECK-NEXT:  .cfi_negate_ra_state
 ;; CHECK-NOT:   .cfi_
+;; CHECK-NEXT:  pacibsp
 ;; CHECK:       .cfi_endproc
 
 attributes #0 = { norecurse nounwind readnone "sign-return-address"="all" "sign-return-address-key"="b_key" }
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-arm64-tls-dynamics.ll b/llvm/test/CodeGen/AArch64/ptrauth-arm64-tls-dynamics.ll
new file mode 100644
index 0000000..89731e6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ptrauth-arm64-tls-dynamics.ll
@@ -0,0 +1,114 @@
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+pauth -relocation-model=pic \
+; RUN:   -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+pauth -relocation-model=pic \
+; RUN:   -filetype=obj < %s | llvm-readelf -r -s - | FileCheck --check-prefix=CHECK-OBJ %s
+; RUN: not --crash llc -mtriple=aarch64-unknown-linux-gnu -mattr=+pauth -relocation-model=pic \
+; RUN:   -global-isel=1 < %s 2>&1 | FileCheck --check-prefix=CHECK-ERR %s
+
+@general_dynamic_var = external thread_local global i32
+
+define i32 @test_generaldynamic() {
+; CHECK-LABEL: test_generaldynamic:
+
+  %val = load i32, ptr @general_dynamic_var
+  ret i32 %val
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc_auth:general_dynamic_var
+; CHECK-NEXT: ldr x16, [x[[TLSDESC_HI]], :tlsdesc_auth_lo12:general_dynamic_var]
+; CHECK-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_auth_lo12:general_dynamic_var
+; CHECK-NEXT: blraa x16, x0
+; CHECK-NEXT: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+; CHECK-NEXT: ldr w0, [x[[TPIDR]], x0]
+
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADR_PAGE21
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_LD64_LO12
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADD_LO12
+; CHECK-OBJ-NOT: R_AARCH64_TLSDESC_CALL
+
+; CHECK-ERR: LLVM ERROR: cannot select: %1:gpr64sp(p0) = G_GLOBAL_VALUE @general_dynamic_var (in function: test_generaldynamic)
+}
+
+define ptr @test_generaldynamic_addr() {
+; CHECK-LABEL: test_generaldynamic_addr:
+
+  ret ptr @general_dynamic_var
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc_auth:general_dynamic_var
+; CHECK-NEXT: ldr x16, [x[[TLSDESC_HI]], :tlsdesc_auth_lo12:general_dynamic_var]
+; CHECK-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_auth_lo12:general_dynamic_var
+; CHECK-NEXT: blraa x16, x0
+; CHECK-NEXT: mrs [[TP:x[0-9]+]], TPIDR_EL0
+; CHECK-NEXT: add x0, [[TP]], x0
+
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADR_PAGE21
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_LD64_LO12
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADD_LO12
+; CHECK-OBJ-NOT: R_AARCH64_TLSDESC_CALL
+}
+
+;; Note: with signed TLSDESC, general dynamic model is always used,
+;; even when local dynamic is requested.
+
+@local_dynamic_var = external thread_local(localdynamic) global i32
+
+define i32 @test_localdynamic() {
+; CHECK-LABEL: test_localdynamic:
+
+  %val = load i32, ptr @local_dynamic_var
+  ret i32 %val
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc_auth:local_dynamic_var
+; CHECK-NEXT: ldr x16, [x[[TLSDESC_HI]], :tlsdesc_auth_lo12:local_dynamic_var]
+; CHECK-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_auth_lo12:local_dynamic_var
+; CHECK-NEXT: blraa x16, x0
+; CHECK-NEXT: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+; CHECK-NEXT: ldr w0, [x[[TPIDR]], x0]
+
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADR_PAGE21
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_LD64_LO12
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADD_LO12
+; CHECK-OBJ-NOT: R_AARCH64_TLSDESC_CALL
+}
+
+define ptr @test_localdynamic_addr() {
+; CHECK-LABEL: test_localdynamic_addr:
+
+  ret ptr @local_dynamic_var
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc_auth:local_dynamic_var
+; CHECK-NEXT: ldr x16, [x[[TLSDESC_HI]], :tlsdesc_auth_lo12:local_dynamic_var]
+; CHECK-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_auth_lo12:local_dynamic_var
+; CHECK-NEXT: blraa x16, x0
+; CHECK-NEXT: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+; CHECK-NEXT: add x0, x[[TPIDR]], x0
+
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADR_PAGE21
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_LD64_LO12
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADD_LO12
+; CHECK-OBJ-NOT: R_AARCH64_TLSDESC_CALL
+}
+
+@extern_weak_var = extern_weak thread_local global i32
+
+define i32 @test_extern_weak() {
+; CHECK-LABEL: test_extern_weak:
+
+  %val = load i32, ptr @extern_weak_var
+  ret i32 %val
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc_auth:extern_weak_var
+; CHECK-NEXT: ldr x16, [x[[TLSDESC_HI]], :tlsdesc_auth_lo12:extern_weak_var]
+; CHECK-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_auth_lo12:extern_weak_var
+; CHECK-NEXT: blraa x16, x0
+; CHECK-NEXT: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+; CHECK-NEXT: ldr w0, [x[[TPIDR]], x0]
+
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADR_PAGE21
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_LD64_LO12
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADD_LO12
+; CHECK-OBJ-NOT: R_AARCH64_TLSDESC_CALL
+; CHECK-OBJ: 0000000000000000     0 TLS     WEAK   DEFAULT   UND extern_weak_var
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 8, !"ptrauth-elf-got", i32 1}
diff --git a/llvm/test/CodeGen/AArch64/reduce-or-opt.ll b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
new file mode 100644
index 0000000..f5df5ea
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+sve | FileCheck %s
+
+define i64 @select_or_reduce_v2i1(ptr nocapture noundef readonly %src) {
+; CHECK-LABEL: select_or_reduce_v2i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  .LBB0_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr q0, [x0, x8]
+; CHECK-NEXT:    cmeq v0.2d, v0.2d, #0
+; CHECK-NEXT:    umaxv s0, v0.4s
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    tbnz w9, #0, .LBB0_3
+; CHECK-NEXT:  // %bb.2: // %vector.body
+; CHECK-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    cmp x8, #16
+; CHECK-NEXT:    add x8, x8, #16
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  .LBB0_3: // %middle.split
+; CHECK-NEXT:    and x0, x9, #0x1
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, 2
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  %sel = select i1 %or.reduc, i64 1, i64 0
+  ret i64 %sel
+}
+
+define i64 @br_or_reduce_v2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
+; CHECK-LABEL: br_or_reduce_v2i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  .LBB1_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr q0, [x0, x8]
+; CHECK-NEXT:    cmeq v0.2d, v0.2d, #0
+; CHECK-NEXT:    umaxv s0, v0.4s
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    tbnz w9, #0, .LBB1_3
+; CHECK-NEXT:  // %bb.2: // %vector.body
+; CHECK-NEXT:    // in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    cmp x8, #16
+; CHECK-NEXT:    add x8, x8, #16
+; CHECK-NEXT:    b.ne .LBB1_1
+; CHECK-NEXT:  .LBB1_3: // %middle.split
+; CHECK-NEXT:    tbz w9, #0, .LBB1_5
+; CHECK-NEXT:  // %bb.4: // %found
+; CHECK-NEXT:    mov w8, #56 // =0x38
+; CHECK-NEXT:    mov w0, #1 // =0x1
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB1_5:
+; CHECK-NEXT:    mov x0, xzr
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, 2
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  br i1 %or.reduc, label %found, label %notfound
+
+found:
+  store i64 56, ptr %p, align 8
+  ret i64 1
+
+notfound:
+  ret i64 0
+}
+
+define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) {
+; CHECK-LABEL: select_or_reduce_nxv2i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x9, xzr
+; CHECK-NEXT:    neg x10, x8
+; CHECK-NEXT:    add x10, x10, #4
+; CHECK-NEXT:  .LBB2_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT:    b.ne .LBB2_3
+; CHECK-NEXT:  // %bb.2: // %vector.body
+; CHECK-NEXT:    // in Loop: Header=BB2_1 Depth=1
+; CHECK-NEXT:    cmp x10, x9
+; CHECK-NEXT:    add x9, x9, x8
+; CHECK-NEXT:    b.ne .LBB2_1
+; CHECK-NEXT:  .LBB2_3: // %middle.split
+; CHECK-NEXT:    ptest p0, p1.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vf = shl nuw nsw i64 %vscale, 1
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, %vf
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  %sel = select i1 %or.reduc, i64 1, i64 0
+  ret i64 %sel
+}
+
+define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
+; CHECK-LABEL: br_or_reduce_nxv2i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x9, xzr
+; CHECK-NEXT:    neg x10, x8
+; CHECK-NEXT:    add x10, x10, #4
+; CHECK-NEXT:  .LBB3_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT:    b.ne .LBB3_3
+; CHECK-NEXT:  // %bb.2: // %vector.body
+; CHECK-NEXT:    // in Loop: Header=BB3_1 Depth=1
+; CHECK-NEXT:    cmp x10, x9
+; CHECK-NEXT:    add x9, x9, x8
+; CHECK-NEXT:    b.ne .LBB3_1
+; CHECK-NEXT:  .LBB3_3: // %middle.split
+; CHECK-NEXT:    ptest p0, p1.b
+; CHECK-NEXT:    b.eq .LBB3_5
+; CHECK-NEXT:  // %bb.4: // %found
+; CHECK-NEXT:    mov w8, #56 // =0x38
+; CHECK-NEXT:    mov w0, #1 // =0x1
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB3_5:
+; CHECK-NEXT:    mov x0, xzr
+; CHECK-NEXT:    ret
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vf = shl nuw nsw i64 %vscale, 1
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, %vf
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  br i1 %or.reduc, label %found, label %notfound
+
+found:
+  store i64 56, ptr %p, align 8
+  ret i64 1
+
+notfound:
+  ret i64 0
+}
+
+declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>)
+declare i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1>)
diff --git a/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir b/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir
index b61fa4b..08fc47d 100644
--- a/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir
+++ b/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir
@@ -1,5 +1,5 @@
+# RUN: llc -mtriple=aarch64 -o /dev/null -run-pass=register-coalescer -aarch64-enable-subreg-liveness-tracking -debug-only=regalloc %s 2>&1 | FileCheck %s --check-prefix=CHECK-DBG
 # RUN: llc -mtriple=aarch64 -verify-machineinstrs -o - -run-pass=register-coalescer -aarch64-enable-subreg-liveness-tracking %s | FileCheck %s --check-prefix=CHECK
-# RUN: llc -mtriple=aarch64 -verify-machineinstrs -o /dev/null -run-pass=register-coalescer -aarch64-enable-subreg-liveness-tracking -debug-only=regalloc %s 2>&1 | FileCheck %s --check-prefix=CHECK-DBG
 # REQUIRES: asserts
 
 # CHECK-DBG: ********** REGISTER COALESCER **********
@@ -36,3 +36,94 @@ body:             |
     RET_ReallyLR
 
 ...
+# CHECK-DBG: ********** REGISTER COALESCER **********
+# CHECK-DBG: ********** Function: reproducer
+# CHECK-DBG: ********** JOINING INTERVALS ***********
+# CHECK-DBG: ********** INTERVALS **********
+# CHECK-DBG: %1 [32r,48B:2)[48B,320r:0)[320r,368B:1) 0@48B-phi 1@320r 2@32r
+# CHECK-DBG-SAME: weight:0.000000e+00
+# CHECK-DBG: %3 [80r,160B:2)[240r,272B:1)[288r,304B:0)[304B,320r:3) 0@288r 1@240r 2@80r 3@304B-phi
+# CHECK-DBG-SAME: L0000000000000080 [288r,304B:0)[304B,320r:3) 0@288r 1@x 2@x 3@304B-phi
+# CHECK-DBG-SAME: L0000000000000040 [80r,160B:2)[240r,272B:1)[288r,304B:0)[304B,320r:3) 0@288r 1@240r 2@80r 3@304B-phi
+# CHECK-DBG-SAME: weight:0.000000e+00
+---
+name:              reproducer
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    %0:gpr32 = MOVi32imm 1
+    %1:gpr64 = IMPLICIT_DEF
+
+  bb.1:
+
+  bb.2:
+    %3:gpr64all = SUBREG_TO_REG 0, %0, %subreg.sub_32
+
+  bb.3:
+    $nzcv = IMPLICIT_DEF
+    %4:gpr64 = COPY killed %3
+    Bcc 1, %bb.7, implicit killed $nzcv
+
+  bb.4:
+    $nzcv = IMPLICIT_DEF
+    Bcc 1, %bb.6, implicit killed $nzcv
+
+  bb.5:
+    %5:gpr64all = SUBREG_TO_REG 0, %0, %subreg.sub_32
+    %4:gpr64 = COPY killed %5
+    B %bb.7
+
+  bb.6:
+    %4:gpr64 = COPY $xzr
+
+  bb.7:
+    %7:gpr64 = ADDXrs killed %1, killed %4, 1
+    %1:gpr64 = COPY killed %7
+    B %bb.1
+
+...
+# CHECK-DBG: ********** REGISTER COALESCER **********
+# CHECK-DBG: ********** Function: reproducer2
+# CHECK-DBG: ********** JOINING INTERVALS ***********
+# CHECK-DBG: ********** INTERVALS **********
+# CHECK-DBG: %1 [32r,48B:2)[48B,304r:0)[304r,352B:1) 0@48B-phi 1@304r 2@32r
+# CHECK-DBG-SAME: weight:0.000000e+00
+# CHECK-DBG: %3 [80r,160B:2)[224r,256B:1)[272r,288B:0)[288B,304r:3) 0@272r 1@224r 2@80r 3@288B-phi
+# CHECK-DBG-SAME: L0000000000000080 [224r,256B:1)[272r,288B:0)[288B,304r:3) 0@272r 1@224r 2@x 3@288B-phi
+# CHECK-DBG-SAME: L0000000000000040 [80r,160B:2)[224r,256B:1)[272r,288B:0)[288B,304r:3) 0@272r 1@224r 2@80r 3@288B-phi
+# CHECK-DBG-SAME: weight:0.000000e+00
+---
+name:              reproducer2
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    %0:gpr32 = MOVi32imm 1
+    %1:gpr64 = IMPLICIT_DEF
+
+  bb.1:
+
+  bb.2:
+    %3:gpr64all = SUBREG_TO_REG 0, %0, %subreg.sub_32
+
+  bb.3:
+    $nzcv = IMPLICIT_DEF
+    %4:gpr64 = COPY killed %3
+    Bcc 1, %bb.7, implicit killed $nzcv
+
+  bb.4:
+    $nzcv = IMPLICIT_DEF
+    Bcc 1, %bb.6, implicit killed $nzcv
+
+  bb.5:
+    %4:gpr64 = IMPLICIT_DEF
+    B %bb.7
+
+  bb.6:
+    %4:gpr64 = COPY $xzr
+
+  bb.7:
+    %5:gpr64 = ADDXrs killed %1, killed %4, 1
+    %1:gpr64 = COPY killed %5
+    B %bb.1
+
+...
diff --git a/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll b/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll
index ec7548e..b7fae2b 100644
--- a/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll
+++ b/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll
@@ -7,19 +7,17 @@
 define i32 @testmswbf(bfloat %a) {
 ; CHECK-LABEL: testmswbf:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    frintm s0, s0
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-NEXT:    add w8, w9, w8
 ; CHECK-NEXT:    add w8, w10, w8
 ; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    lsl w8, w8, #16
 ; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs w0, s0
 ; CHECK-NEXT:    ret
 entry:
@@ -31,19 +29,17 @@ entry:
 define i64 @testmsxbf(bfloat %a) {
 ; CHECK-LABEL: testmsxbf:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    frintm s0, s0
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-NEXT:    add w8, w9, w8
 ; CHECK-NEXT:    add w8, w10, w8
 ; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    lsl w8, w8, #16
 ; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs x0, s0
 ; CHECK-NEXT:    ret
 entry:
@@ -141,19 +137,17 @@ entry:
 define i32 @testpswbf(bfloat %a) {
 ; CHECK-LABEL: testpswbf:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    frintp s0, s0
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-NEXT:    add w8, w9, w8
 ; CHECK-NEXT:    add w8, w10, w8
 ; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    lsl w8, w8, #16
 ; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs w0, s0
 ; CHECK-NEXT:    ret
 entry:
@@ -165,19 +159,17 @@ entry:
 define i64 @testpsxbf(bfloat %a) {
 ; CHECK-LABEL: testpsxbf:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    frintp s0, s0
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-NEXT:    add w8, w9, w8
 ; CHECK-NEXT:    add w8, w10, w8
 ; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    lsl w8, w8, #16
 ; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs x0, s0
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/selectopt.ll b/llvm/test/CodeGen/AArch64/selectopt.ll
index 54309dc..d72a956 100644
--- a/llvm/test/CodeGen/AArch64/selectopt.ll
+++ b/llvm/test/CodeGen/AArch64/selectopt.ll
@@ -875,3 +875,124 @@ if.end:
   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
+
+declare i64 @payload(i64, ptr, ptr, i64)
+
+define void @outer_latch_heuristic(ptr %dst, ptr %src, i64 %p, i64 %dim) {
+; CHECKOO-LABEL: @outer_latch_heuristic(
+; CHECKOO-NEXT:  entry:
+; CHECKOO-NEXT:    br label [[OUTER_LOOP:%.*]]
+; CHECKOO:       outer.loop:
+; CHECKOO-NEXT:    [[K_020_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[SELECT_END:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECKOO-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[SELECT_END]] ], [ 0, [[ENTRY]] ]
+; CHECKOO-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[SELECT_END]] ], [ 0, [[ENTRY]] ]
+; CHECKOO-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]]
+; CHECKOO-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
+; CHECKOO-NEXT:    [[ARRAYIDX1_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[J]]
+; CHECKOO-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX1_US]], align 8
+; CHECKOO-NEXT:    br label [[INNER_LOOP:%.*]]
+; CHECKOO:       inner.loop:
+; CHECKOO-NEXT:    [[LSR_IV:%.*]] = phi i64 [ [[DIM:%.*]], [[OUTER_LOOP]] ], [ [[LSR_IV_NEXT:%.*]], [[INNER_LOOP]] ]
+; CHECKOO-NEXT:    [[DIFF_04_I_US:%.*]] = phi i64 [ [[CALL_I_US:%.*]], [[INNER_LOOP]] ], [ 0, [[OUTER_LOOP]] ]
+; CHECKOO-NEXT:    [[CALL_I_US]] = tail call i64 @payload(i64 [[DIFF_04_I_US]], ptr [[TMP0]], ptr [[TMP1]], i64 [[P:%.*]])
+; CHECKOO-NEXT:    [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
+; CHECKOO-NEXT:    [[EXITCOND_NOT_I_US:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECKOO-NEXT:    br i1 [[EXITCOND_NOT_I_US]], label [[LATCH:%.*]], label [[INNER_LOOP]]
+; CHECKOO:       latch:
+; CHECKOO-NEXT:    [[CMP2_US:%.*]] = icmp sgt i64 [[CALL_I_US]], -1
+; CHECKOO-NEXT:    [[DIFF_0_LCSSA_I_LOBIT_US:%.*]] = lshr i64 [[CALL_I_US]], 63
+; CHECKOO-NEXT:    [[CMP2_US_FROZEN:%.*]] = freeze i1 [[CMP2_US]]
+; CHECKOO-NEXT:    br i1 [[CMP2_US_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_FALSE_SINK:%.*]]
+; CHECKOO:       select.true.sink:
+; CHECKOO-NEXT:    [[TMP2:%.*]] = add nsw i64 [[J]], 1
+; CHECKOO-NEXT:    br label [[SELECT_END]]
+; CHECKOO:       select.false.sink:
+; CHECKOO-NEXT:    [[TMP3:%.*]] = add nsw i64 1, [[I]]
+; CHECKOO-NEXT:    br label [[SELECT_END]]
+; CHECKOO:       select.end:
+; CHECKOO-NEXT:    [[I_NEXT]] = phi i64 [ [[I]], [[SELECT_TRUE_SINK]] ], [ [[TMP3]], [[SELECT_FALSE_SINK]] ]
+; CHECKOO-NEXT:    [[J_NEXT]] = phi i64 [ [[TMP2]], [[SELECT_TRUE_SINK]] ], [ [[J]], [[SELECT_FALSE_SINK]] ]
+; CHECKOO-NEXT:    [[COND_IN_US:%.*]] = phi ptr [ [[ARRAYIDX1_US]], [[SELECT_TRUE_SINK]] ], [ [[ARRAYIDX_US]], [[SELECT_FALSE_SINK]] ]
+; CHECKOO-NEXT:    [[INC4_US:%.*]] = zext i1 [[CMP2_US]] to i64
+; CHECKOO-NEXT:    [[COND_US:%.*]] = load ptr, ptr [[COND_IN_US]], align 8
+; CHECKOO-NEXT:    [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i64 [[K_020_US]]
+; CHECKOO-NEXT:    store ptr [[COND_US]], ptr [[ARRAYIDX6_US]], align 8
+; CHECKOO-NEXT:    [[INC7_US]] = add i64 [[K_020_US]], 1
+; CHECKOO-NEXT:    [[EXITCOND23_NOT:%.*]] = icmp eq i64 [[K_020_US]], 1000
+; CHECKOO-NEXT:    br i1 [[EXITCOND23_NOT]], label [[EXIT:%.*]], label [[OUTER_LOOP]]
+; CHECKOO:       exit:
+; CHECKOO-NEXT:    ret void
+;
+; CHECKII-LABEL: @outer_latch_heuristic(
+; CHECKII-NEXT:  entry:
+; CHECKII-NEXT:    br label [[OUTER_LOOP:%.*]]
+; CHECKII:       outer.loop:
+; CHECKII-NEXT:    [[K_020_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECKII-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[LATCH]] ], [ 0, [[ENTRY]] ]
+; CHECKII-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH]] ], [ 0, [[ENTRY]] ]
+; CHECKII-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]]
+; CHECKII-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
+; CHECKII-NEXT:    [[ARRAYIDX1_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[J]]
+; CHECKII-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX1_US]], align 8
+; CHECKII-NEXT:    br label [[INNER_LOOP:%.*]]
+; CHECKII:       inner.loop:
+; CHECKII-NEXT:    [[LSR_IV:%.*]] = phi i64 [ [[DIM:%.*]], [[OUTER_LOOP]] ], [ [[LSR_IV_NEXT:%.*]], [[INNER_LOOP]] ]
+; CHECKII-NEXT:    [[DIFF_04_I_US:%.*]] = phi i64 [ [[CALL_I_US:%.*]], [[INNER_LOOP]] ], [ 0, [[OUTER_LOOP]] ]
+; CHECKII-NEXT:    [[CALL_I_US]] = tail call i64 @payload(i64 [[DIFF_04_I_US]], ptr [[TMP0]], ptr [[TMP1]], i64 [[P:%.*]])
+; CHECKII-NEXT:    [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
+; CHECKII-NEXT:    [[EXITCOND_NOT_I_US:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECKII-NEXT:    br i1 [[EXITCOND_NOT_I_US]], label [[LATCH]], label [[INNER_LOOP]]
+; CHECKII:       latch:
+; CHECKII-NEXT:    [[CMP2_US:%.*]] = icmp sgt i64 [[CALL_I_US]], -1
+; CHECKII-NEXT:    [[DIFF_0_LCSSA_I_LOBIT_US:%.*]] = lshr i64 [[CALL_I_US]], 63
+; CHECKII-NEXT:    [[I_NEXT]] = add nsw i64 [[DIFF_0_LCSSA_I_LOBIT_US]], [[I]]
+; CHECKII-NEXT:    [[INC4_US:%.*]] = zext i1 [[CMP2_US]] to i64
+; CHECKII-NEXT:    [[J_NEXT]] = add nsw i64 [[J]], [[INC4_US]]
+; CHECKII-NEXT:    [[COND_IN_US:%.*]] = select i1 [[CMP2_US]], ptr [[ARRAYIDX1_US]], ptr [[ARRAYIDX_US]]
+; CHECKII-NEXT:    [[COND_US:%.*]] = load ptr, ptr [[COND_IN_US]], align 8
+; CHECKII-NEXT:    [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i64 [[K_020_US]]
+; CHECKII-NEXT:    store ptr [[COND_US]], ptr [[ARRAYIDX6_US]], align 8
+; CHECKII-NEXT:    [[INC7_US]] = add i64 [[K_020_US]], 1
+; CHECKII-NEXT:    [[EXITCOND23_NOT:%.*]] = icmp eq i64 [[K_020_US]], 1000
+; CHECKII-NEXT:    br i1 [[EXITCOND23_NOT]], label [[EXIT:%.*]], label [[OUTER_LOOP]]
+; CHECKII:       exit:
+; CHECKII-NEXT:    ret void
+;
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %k.020.us = phi i64 [ %inc7.us, %latch ], [ 0, %entry ]
+  %j = phi i64 [ %j.next, %latch ], [ 0, %entry ]
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %arrayidx.us = getelementptr inbounds ptr, ptr %src, i64 %i
+  %4 = load ptr, ptr %arrayidx.us, align 8
+  %arrayidx1.us = getelementptr inbounds ptr, ptr %src, i64 %j
+  %5 = load ptr, ptr %arrayidx1.us, align 8
+  br label %inner.loop
+
+inner.loop:
+  %lsr.iv = phi i64 [ %dim, %outer.loop ], [ %lsr.iv.next, %inner.loop ]
+  %diff.04.i.us = phi i64 [ %call.i.us, %inner.loop ], [ 0, %outer.loop ]
+  %call.i.us = tail call i64 @payload(i64 %diff.04.i.us, ptr %4, ptr %5, i64 %p)
+  %lsr.iv.next = add i64 %lsr.iv, -1
+  %exitcond.not.i.us = icmp eq i64 %lsr.iv.next, 0
+  br i1 %exitcond.not.i.us, label %latch, label %inner.loop
+
+latch:
+  %cmp2.us = icmp sgt i64 %call.i.us, -1
+  %diff.0.lcssa.i.lobit.us = lshr i64 %call.i.us, 63
+  %i.next = add nsw i64 %diff.0.lcssa.i.lobit.us, %i
+  %inc4.us = zext i1 %cmp2.us to i64
+  %j.next = add nsw i64 %j, %inc4.us
+  %cond.in.us = select i1 %cmp2.us, ptr %arrayidx1.us, ptr %arrayidx.us
+  %cond.us = load ptr, ptr %cond.in.us, align 8
+  %arrayidx6.us = getelementptr inbounds ptr, ptr %dst, i64 %k.020.us
+  store ptr %cond.us, ptr %arrayidx6.us, align 8
+  %inc7.us = add i64 %k.020.us, 1
+  %exitcond23.not = icmp eq i64 %k.020.us, 1000
+  br i1 %exitcond23.not, label %exit, label %outer.loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/shuffle-select.ll b/llvm/test/CodeGen/AArch64/shuffle-select.ll
new file mode 100644
index 0000000..eeccaa1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/shuffle-select.ll
@@ -0,0 +1,163 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s
+
+define <8 x i8> @sel_v8i8(<8 x i8> %v0, <8 x i8> %v1) {
+; CHECK-LABEL: sel_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    adrp x8, .LCPI0_0
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  ret <8 x i8> %tmp0
+}
+
+define <16 x i8> @sel_v16i8(<16 x i8> %v0, <16 x i8> %v1) {
+; CHECK-LABEL: sel_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI1_0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+  ret <16 x i8> %tmp0
+}
+
+define <16 x i8> @sel_v16i8_poison(<16 x i8> %v0, <16 x i8> %v1) {
+; CHECK-LABEL: sel_v16i8_poison:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI2_0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI2_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 22, i32 23, i32 24, i32 25, i32 26, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <16 x i8> %tmp0
+}
+
+define <16 x i8> @sel_v16i8_unregular(<16 x i8> %v0, <16 x i8> %v1) {
+; CHECK-LABEL: sel_v16i8_unregular:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI3_0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %tmp0
+}
+
+define <4 x i16> @sel_v4i16(<4 x i16> %v0, <4 x i16> %v1) {
+; CHECK-LABEL: sel_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev32 v0.4h, v0.4h
+; CHECK-NEXT:    trn2 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i16> %tmp0
+}
+
+define <8 x i16> @sel_v8i16(<8 x i16> %v0, <8 x i16> %v1) {
+; CHECK-LABEL: sel_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI5_0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI5_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  ret <8 x i16> %tmp0
+}
+
+define <2 x i32> @sel_v2i32(<2 x i32> %v0, <2 x i32> %v1) {
+; CHECK-LABEL: sel_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 3>
+  ret <2 x i32> %tmp0
+}
+
+define <4 x i32> @sel_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: sel_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-NEXT:    trn2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %tmp0
+}
+
+define <2 x i64> @sel_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
+; CHECK-LABEL: sel_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.d[1], v1.d[1]
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> <i32 0, i32 3>
+  ret <2 x i64> %tmp0
+}
+
+define <4 x half> @sel_v4f16(<4 x half> %v0, <4 x half> %v1) {
+; CHECK-LABEL: sel_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev32 v0.4h, v0.4h
+; CHECK-NEXT:    trn2 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x half> %tmp0
+}
+
+define <8 x half> @sel_v8f16(<8 x half> %v0, <8 x half> %v1) {
+; CHECK-LABEL: sel_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI10_0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI10_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  ret <8 x half> %tmp0
+}
+
+define <2 x float> @sel_v2f32(<2 x float> %v0, <2 x float> %v1) {
+; CHECK-LABEL: sel_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 0, i32 3>
+  ret <2 x float> %tmp0
+}
+
+define <4 x float> @sel_v4f32(<4 x float> %v0, <4 x float> %v1) {
+; CHECK-LABEL: sel_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-NEXT:    trn2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x float> %tmp0
+}
+
+define <2 x double> @sel_v2f64(<2 x double> %v0, <2 x double> %v1) {
+; CHECK-LABEL: sel_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.d[1], v1.d[1]
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %tmp0
+}
diff --git a/llvm/test/CodeGen/AArch64/sign-return-address-cfi-negate-ra-state.ll b/llvm/test/CodeGen/AArch64/sign-return-address-cfi-negate-ra-state.ll
index 4d4b7c2..6ea0728 100644
--- a/llvm/test/CodeGen/AArch64/sign-return-address-cfi-negate-ra-state.ll
+++ b/llvm/test/CodeGen/AArch64/sign-return-address-cfi-negate-ra-state.ll
@@ -9,8 +9,8 @@
 define dso_local i32 @_Z3fooi(i32 %x) #0 {
 ; CHECK-V8A-LABEL: _Z3fooi:
 ; CHECK-V8A:       // %bb.0: // %entry
-; CHECK-V8A-NEXT:    hint #25
 ; CHECK-V8A-NEXT:    .cfi_negate_ra_state
+; CHECK-V8A-NEXT:    hint #25
 ; CHECK-V8A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-V8A-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-V8A-NEXT:    .cfi_offset w30, -16
@@ -27,8 +27,8 @@ define dso_local i32 @_Z3fooi(i32 %x) #0 {
 ;
 ; CHECK-V83A-LABEL: _Z3fooi:
 ; CHECK-V83A:       // %bb.0: // %entry
-; CHECK-V83A-NEXT:    paciasp
 ; CHECK-V83A-NEXT:    .cfi_negate_ra_state
+; CHECK-V83A-NEXT:    paciasp
 ; CHECK-V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-V83A-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-V83A-NEXT:    .cfi_offset w30, -16
@@ -62,8 +62,8 @@ return:                                           ; No predecessors!
 define hidden noundef i32 @baz_async(i32 noundef %a) #0 uwtable(async) {
 ; CHECK-V8A-LABEL: baz_async:
 ; CHECK-V8A:       // %bb.0: // %entry
-; CHECK-V8A-NEXT:    hint #25
 ; CHECK-V8A-NEXT:    .cfi_negate_ra_state
+; CHECK-V8A-NEXT:    hint #25
 ; CHECK-V8A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-V8A-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-V8A-NEXT:    .cfi_offset w30, -16
@@ -74,8 +74,8 @@ define hidden noundef i32 @baz_async(i32 noundef %a) #0 uwtable(async) {
 ; CHECK-V8A-NEXT:    bl _Z3bari
 ; CHECK-V8A-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-V8A-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-V8A-NEXT:    hint #29
 ; CHECK-V8A-NEXT:    .cfi_negate_ra_state
+; CHECK-V8A-NEXT:    hint #29
 ; CHECK-V8A-NEXT:    .cfi_restore w30
 ; CHECK-V8A-NEXT:    b _Z3bari
 ; CHECK-V8A-NEXT:  .LBB1_2: // %if.else
@@ -84,15 +84,15 @@ define hidden noundef i32 @baz_async(i32 noundef %a) #0 uwtable(async) {
 ; CHECK-V8A-NEXT:    add w0, w0, #1
 ; CHECK-V8A-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-V8A-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-V8A-NEXT:    hint #29
 ; CHECK-V8A-NEXT:    .cfi_negate_ra_state
+; CHECK-V8A-NEXT:    hint #29
 ; CHECK-V8A-NEXT:    .cfi_restore w30
 ; CHECK-V8A-NEXT:    ret
 ;
 ; CHECK-V83A-LABEL: baz_async:
 ; CHECK-V83A:       // %bb.0: // %entry
-; CHECK-V83A-NEXT:    paciasp
 ; CHECK-V83A-NEXT:    .cfi_negate_ra_state
+; CHECK-V83A-NEXT:    paciasp
 ; CHECK-V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-V83A-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-V83A-NEXT:    .cfi_offset w30, -16
@@ -103,8 +103,8 @@ define hidden noundef i32 @baz_async(i32 noundef %a) #0 uwtable(async) {
 ; CHECK-V83A-NEXT:    bl _Z3bari
 ; CHECK-V83A-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-V83A-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-V83A-NEXT:    autiasp
 ; CHECK-V83A-NEXT:    .cfi_negate_ra_state
+; CHECK-V83A-NEXT:    autiasp
 ; CHECK-V83A-NEXT:    .cfi_restore w30
 ; CHECK-V83A-NEXT:    b _Z3bari
 ; CHECK-V83A-NEXT:  .LBB1_2: // %if.else
@@ -143,8 +143,8 @@ return:                                           ; preds = %if.else, %if.then
 define hidden noundef i32 @baz_sync(i32 noundef %a) #0 uwtable(sync) {
 ; CHECK-V8A-LABEL: baz_sync:
 ; CHECK-V8A:       // %bb.0: // %entry
-; CHECK-V8A-NEXT:    hint #25
 ; CHECK-V8A-NEXT:    .cfi_negate_ra_state
+; CHECK-V8A-NEXT:    hint #25
 ; CHECK-V8A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-V8A-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-V8A-NEXT:    .cfi_offset w30, -16
@@ -164,8 +164,8 @@ define hidden noundef i32 @baz_sync(i32 noundef %a) #0 uwtable(sync) {
 ;
 ; CHECK-V83A-LABEL: baz_sync:
 ; CHECK-V83A:       // %bb.0: // %entry
-; CHECK-V83A-NEXT:    paciasp
 ; CHECK-V83A-NEXT:    .cfi_negate_ra_state
+; CHECK-V83A-NEXT:    paciasp
 ; CHECK-V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-V83A-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-V83A-NEXT:    .cfi_offset w30, -16
@@ -216,7 +216,7 @@ attributes #0 = { "sign-return-address"="all" }
 ; CHECK-DUMP-NOT: DW_CFA_remember_state
 ; CHECK-DUMP-NOT: DW_CFA_restore_state
 
-; CHECK-DUMP: CFA=WSP{{$}}
+; CHECK-DUMP: CFA=WSP
 ; CHECK-DUMP: reg34=1
 ; CHECK-DUMP-NOT: reg34=0
 
@@ -229,6 +229,7 @@ attributes #0 = { "sign-return-address"="all" }
 ; CHECK-DUMP:   DW_CFA_restore_state:
 ; CHECK-DUMP:   DW_CFA_AARCH64_negate_ra_state:
 
+; CHECK-DUMP: CFA=WSP
 ;; First DW_CFA_AARCH64_negate_ra_state:
 ; CHECK-DUMP: reg34=1
 ;; Second DW_CFA_AARCH64_negate_ra_state:
@@ -237,7 +238,6 @@ attributes #0 = { "sign-return-address"="all" }
 ; CHECK-DUMP: reg34=1
 ;; Third DW_CFA_AARCH64_negate_ra_state:
 ; CHECK-DUMP: reg34=0
-; CHECK-DUMP-NOT: reg34=1
 
 ; baz_sync
 ; CHECK-DUMP-LABEL: FDE
@@ -246,6 +246,6 @@ attributes #0 = { "sign-return-address"="all" }
 ; CHECK-DUMP-NOT: DW_CFA_remember_state
 ; CHECK-DUMP-NOT: DW_CFA_restore_state
 
-; CHECK-DUMP: CFA=WSP{{$}}
+; CHECK-DUMP: CFA=WSP
 ; CHECK-DUMP: reg34=1
 ; CHECK-DUMP-NOT: reg34=0
diff --git a/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll b/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll
index fa689d2..f37f122 100644
--- a/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll
+++ b/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll
@@ -60,9 +60,9 @@ define i32 @leaf_sign_all(i32 %x) "branch-protection-pauth-lr" "sign-return-addr
 ; COMPAT-LABEL: leaf_sign_all:
 ; COMPAT:       // %bb.0:
 ; COMPAT-NEXT:    hint #39
+; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:  .Ltmp0:
 ; COMPAT-NEXT:    hint #25
-; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:    adrp x16, .Ltmp0
 ; COMPAT-NEXT:    add x16, x16, :lo12:.Ltmp0
 ; COMPAT-NEXT:    hint #39
@@ -72,9 +72,9 @@ define i32 @leaf_sign_all(i32 %x) "branch-protection-pauth-lr" "sign-return-addr
 ; V83A-LABEL: leaf_sign_all:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    hint #39
+; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:  .Ltmp0:
 ; V83A-NEXT:    paciasp
-; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:    adrp x16, .Ltmp0
 ; V83A-NEXT:    add x16, x16, :lo12:.Ltmp0
 ; V83A-NEXT:    hint #39
@@ -82,9 +82,9 @@ define i32 @leaf_sign_all(i32 %x) "branch-protection-pauth-lr" "sign-return-addr
 ;
 ; PAUTHLR-LABEL: leaf_sign_all:
 ; PAUTHLR:       // %bb.0:
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp0:
 ; PAUTHLR-NEXT:    paciasppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    adrp x16, .Ltmp0
 ; PAUTHLR-NEXT:    add x16, x16, :lo12:.Ltmp0
 ; PAUTHLR-NEXT:    retaasppc .Ltmp0
@@ -95,9 +95,9 @@ define i64 @leaf_clobbers_lr(i64 %x) "branch-protection-pauth-lr" "sign-return-a
 ; COMPAT-LABEL: leaf_clobbers_lr:
 ; COMPAT:       // %bb.0:
 ; COMPAT-NEXT:    hint #39
+; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:  .Ltmp1:
 ; COMPAT-NEXT:    hint #25
-; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; COMPAT-NEXT:    .cfi_def_cfa_offset 16
 ; COMPAT-NEXT:    .cfi_offset w30, -16
@@ -114,9 +114,9 @@ define i64 @leaf_clobbers_lr(i64 %x) "branch-protection-pauth-lr" "sign-return-a
 ; V83A-LABEL: leaf_clobbers_lr:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    hint #39
+; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:  .Ltmp1:
 ; V83A-NEXT:    paciasp
-; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -131,9 +131,9 @@ define i64 @leaf_clobbers_lr(i64 %x) "branch-protection-pauth-lr" "sign-return-a
 ;
 ; PAUTHLR-LABEL: leaf_clobbers_lr:
 ; PAUTHLR:       // %bb.0:
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp1:
 ; PAUTHLR-NEXT:    paciasppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; PAUTHLR-NEXT:    .cfi_def_cfa_offset 16
 ; PAUTHLR-NEXT:    .cfi_offset w30, -16
@@ -154,9 +154,9 @@ define i32 @non_leaf_sign_all(i32 %x) "branch-protection-pauth-lr" "sign-return-
 ; COMPAT-LABEL: non_leaf_sign_all:
 ; COMPAT:       // %bb.0:
 ; COMPAT-NEXT:    hint #39
+; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:  .Ltmp2:
 ; COMPAT-NEXT:    hint #25
-; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; COMPAT-NEXT:    .cfi_def_cfa_offset 16
 ; COMPAT-NEXT:    .cfi_offset w30, -16
@@ -171,9 +171,9 @@ define i32 @non_leaf_sign_all(i32 %x) "branch-protection-pauth-lr" "sign-return-
 ; V83A-LABEL: non_leaf_sign_all:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    hint #39
+; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:  .Ltmp2:
 ; V83A-NEXT:    paciasp
-; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -186,9 +186,9 @@ define i32 @non_leaf_sign_all(i32 %x) "branch-protection-pauth-lr" "sign-return-
 ;
 ; PAUTHLR-LABEL: non_leaf_sign_all:
 ; PAUTHLR:       // %bb.0:
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp2:
 ; PAUTHLR-NEXT:    paciasppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; PAUTHLR-NEXT:    .cfi_def_cfa_offset 16
 ; PAUTHLR-NEXT:    .cfi_offset w30, -16
@@ -205,9 +205,9 @@ define i32 @non_leaf_sign_non_leaf(i32 %x) "branch-protection-pauth-lr" "sign-re
 ; COMPAT-LABEL: non_leaf_sign_non_leaf:
 ; COMPAT:       // %bb.0:
 ; COMPAT-NEXT:    hint #39
+; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:  .Ltmp3:
 ; COMPAT-NEXT:    hint #25
-; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; COMPAT-NEXT:    .cfi_def_cfa_offset 16
 ; COMPAT-NEXT:    .cfi_offset w30, -16
@@ -222,9 +222,9 @@ define i32 @non_leaf_sign_non_leaf(i32 %x) "branch-protection-pauth-lr" "sign-re
 ; V83A-LABEL: non_leaf_sign_non_leaf:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    hint #39
+; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:  .Ltmp3:
 ; V83A-NEXT:    paciasp
-; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -237,9 +237,9 @@ define i32 @non_leaf_sign_non_leaf(i32 %x) "branch-protection-pauth-lr" "sign-re
 ;
 ; PAUTHLR-LABEL: non_leaf_sign_non_leaf:
 ; PAUTHLR:       // %bb.0:
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp3:
 ; PAUTHLR-NEXT:    paciasppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; PAUTHLR-NEXT:    .cfi_def_cfa_offset 16
 ; PAUTHLR-NEXT:    .cfi_offset w30, -16
@@ -259,9 +259,9 @@ define i32 @non_leaf_scs(i32 %x) "branch-protection-pauth-lr" "sign-return-addre
 ; CHECK-NEXT:    str x30, [x18], #8
 ; CHECK-NEXT:    .cfi_escape 0x16, 0x12, 0x02, 0x82, 0x78 //
 ; CHECK-NEXT:    hint #39
+; CHECK-NEXT:    .cfi_negate_ra_state_with_pc
 ; CHECK-NEXT:  .Ltmp4:
 ; CHECK-NEXT:    paciasp
-; CHECK-NEXT:    .cfi_negate_ra_state_with_pc
 ; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -16
@@ -278,9 +278,9 @@ define i32 @non_leaf_scs(i32 %x) "branch-protection-pauth-lr" "sign-return-addre
 ; PAUTHLR:       // %bb.0:
 ; PAUTHLR-NEXT:    str x30, [x18], #8
 ; PAUTHLR-NEXT:    .cfi_escape 0x16, 0x12, 0x02, 0x82, 0x78 //
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp4:
 ; PAUTHLR-NEXT:    paciasppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; PAUTHLR-NEXT:    .cfi_def_cfa_offset 16
 ; PAUTHLR-NEXT:    .cfi_offset w30, -16
@@ -299,9 +299,9 @@ define i32 @leaf_sign_all_v83(i32 %x) "branch-protection-pauth-lr" "sign-return-
 ; CHECK-LABEL: leaf_sign_all_v83:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    hint #39
+; CHECK-NEXT:    .cfi_negate_ra_state_with_pc
 ; CHECK-NEXT:  .Ltmp5:
 ; CHECK-NEXT:    paciasp
-; CHECK-NEXT:    .cfi_negate_ra_state_with_pc
 ; CHECK-NEXT:    adrp x16, .Ltmp5
 ; CHECK-NEXT:    add x16, x16, :lo12:.Ltmp5
 ; CHECK-NEXT:    hint #39
@@ -309,9 +309,9 @@ define i32 @leaf_sign_all_v83(i32 %x) "branch-protection-pauth-lr" "sign-return-
 ;
 ; PAUTHLR-LABEL: leaf_sign_all_v83:
 ; PAUTHLR:       // %bb.0:
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp5:
 ; PAUTHLR-NEXT:    paciasppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    adrp x16, .Ltmp5
 ; PAUTHLR-NEXT:    add x16, x16, :lo12:.Ltmp5
 ; PAUTHLR-NEXT:    retaasppc .Ltmp5
@@ -324,9 +324,9 @@ define fastcc void @spill_lr_and_tail_call(i64 %x) "branch-protection-pauth-lr"
 ; COMPAT-LABEL: spill_lr_and_tail_call:
 ; COMPAT:       // %bb.0:
 ; COMPAT-NEXT:    hint #39
+; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:  .Ltmp6:
 ; COMPAT-NEXT:    hint #25
-; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; COMPAT-NEXT:    .cfi_def_cfa_offset 16
 ; COMPAT-NEXT:    .cfi_offset w30, -16
@@ -343,9 +343,9 @@ define fastcc void @spill_lr_and_tail_call(i64 %x) "branch-protection-pauth-lr"
 ; V83A-LABEL: spill_lr_and_tail_call:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    hint #39
+; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:  .Ltmp6:
 ; V83A-NEXT:    paciasp
-; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -361,9 +361,9 @@ define fastcc void @spill_lr_and_tail_call(i64 %x) "branch-protection-pauth-lr"
 ;
 ; PAUTHLR-LABEL: spill_lr_and_tail_call:
 ; PAUTHLR:       // %bb.0:
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp6:
 ; PAUTHLR-NEXT:    paciasppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; PAUTHLR-NEXT:    .cfi_def_cfa_offset 16
 ; PAUTHLR-NEXT:    .cfi_offset w30, -16
@@ -384,9 +384,9 @@ define i32 @leaf_sign_all_a_key(i32 %x) "branch-protection-pauth-lr" "sign-retur
 ; COMPAT-LABEL: leaf_sign_all_a_key:
 ; COMPAT:       // %bb.0:
 ; COMPAT-NEXT:    hint #39
+; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:  .Ltmp7:
 ; COMPAT-NEXT:    hint #25
-; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:    adrp x16, .Ltmp7
 ; COMPAT-NEXT:    add x16, x16, :lo12:.Ltmp7
 ; COMPAT-NEXT:    hint #39
@@ -396,9 +396,9 @@ define i32 @leaf_sign_all_a_key(i32 %x) "branch-protection-pauth-lr" "sign-retur
 ; V83A-LABEL: leaf_sign_all_a_key:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    hint #39
+; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:  .Ltmp7:
 ; V83A-NEXT:    paciasp
-; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:    adrp x16, .Ltmp7
 ; V83A-NEXT:    add x16, x16, :lo12:.Ltmp7
 ; V83A-NEXT:    hint #39
@@ -406,9 +406,9 @@ define i32 @leaf_sign_all_a_key(i32 %x) "branch-protection-pauth-lr" "sign-retur
 ;
 ; PAUTHLR-LABEL: leaf_sign_all_a_key:
 ; PAUTHLR:       // %bb.0:
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp7:
 ; PAUTHLR-NEXT:    paciasppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    adrp x16, .Ltmp7
 ; PAUTHLR-NEXT:    add x16, x16, :lo12:.Ltmp7
 ; PAUTHLR-NEXT:    retaasppc .Ltmp7
@@ -420,9 +420,9 @@ define i32 @leaf_sign_all_b_key(i32 %x) "branch-protection-pauth-lr" "sign-retur
 ; COMPAT:       // %bb.0:
 ; COMPAT-NEXT:    .cfi_b_key_frame
 ; COMPAT-NEXT:    hint #39
+; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:  .Ltmp8:
 ; COMPAT-NEXT:    hint #27
-; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:    adrp x16, .Ltmp8
 ; COMPAT-NEXT:    add x16, x16, :lo12:.Ltmp8
 ; COMPAT-NEXT:    hint #39
@@ -433,9 +433,9 @@ define i32 @leaf_sign_all_b_key(i32 %x) "branch-protection-pauth-lr" "sign-retur
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    .cfi_b_key_frame
 ; V83A-NEXT:    hint #39
+; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:  .Ltmp8:
 ; V83A-NEXT:    pacibsp
-; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:    adrp x16, .Ltmp8
 ; V83A-NEXT:    add x16, x16, :lo12:.Ltmp8
 ; V83A-NEXT:    hint #39
@@ -444,9 +444,9 @@ define i32 @leaf_sign_all_b_key(i32 %x) "branch-protection-pauth-lr" "sign-retur
 ; PAUTHLR-LABEL: leaf_sign_all_b_key:
 ; PAUTHLR:       // %bb.0:
 ; PAUTHLR-NEXT:    .cfi_b_key_frame
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp8:
 ; PAUTHLR-NEXT:    pacibsppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    adrp x16, .Ltmp8
 ; PAUTHLR-NEXT:    add x16, x16, :lo12:.Ltmp8
 ; PAUTHLR-NEXT:    retabsppc .Ltmp8
@@ -458,9 +458,9 @@ define i32 @leaf_sign_all_v83_b_key(i32 %x) "branch-protection-pauth-lr" "sign-r
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    .cfi_b_key_frame
 ; CHECK-NEXT:    hint #39
+; CHECK-NEXT:    .cfi_negate_ra_state_with_pc
 ; CHECK-NEXT:  .Ltmp9:
 ; CHECK-NEXT:    pacibsp
-; CHECK-NEXT:    .cfi_negate_ra_state_with_pc
 ; CHECK-NEXT:    adrp x16, .Ltmp9
 ; CHECK-NEXT:    add x16, x16, :lo12:.Ltmp9
 ; CHECK-NEXT:    hint #39
@@ -469,9 +469,9 @@ define i32 @leaf_sign_all_v83_b_key(i32 %x) "branch-protection-pauth-lr" "sign-r
 ; PAUTHLR-LABEL: leaf_sign_all_v83_b_key:
 ; PAUTHLR:       // %bb.0:
 ; PAUTHLR-NEXT:    .cfi_b_key_frame
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp9:
 ; PAUTHLR-NEXT:    pacibsppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    adrp x16, .Ltmp9
 ; PAUTHLR-NEXT:    add x16, x16, :lo12:.Ltmp9
 ; PAUTHLR-NEXT:    retabsppc .Ltmp9
@@ -484,9 +484,9 @@ define i32 @leaf_sign_all_a_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r
 ; COMPAT:       // %bb.0:
 ; COMPAT-NEXT:    hint #34
 ; COMPAT-NEXT:    hint #39
+; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:  .Ltmp10:
 ; COMPAT-NEXT:    hint #25
-; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:    adrp x16, .Ltmp10
 ; COMPAT-NEXT:    add x16, x16, :lo12:.Ltmp10
 ; COMPAT-NEXT:    hint #39
@@ -497,9 +497,9 @@ define i32 @leaf_sign_all_a_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    hint #34
 ; V83A-NEXT:    hint #39
+; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:  .Ltmp10:
 ; V83A-NEXT:    paciasp
-; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:    adrp x16, .Ltmp10
 ; V83A-NEXT:    add x16, x16, :lo12:.Ltmp10
 ; V83A-NEXT:    hint #39
@@ -508,9 +508,9 @@ define i32 @leaf_sign_all_a_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r
 ; PAUTHLR-LABEL: leaf_sign_all_a_key_bti:
 ; PAUTHLR:       // %bb.0:
 ; PAUTHLR-NEXT:    bti c
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp10:
 ; PAUTHLR-NEXT:    paciasppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    adrp x16, .Ltmp10
 ; PAUTHLR-NEXT:    add x16, x16, :lo12:.Ltmp10
 ; PAUTHLR-NEXT:    retaasppc .Ltmp10
@@ -524,9 +524,9 @@ define i32 @leaf_sign_all_b_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r
 ; COMPAT-NEXT:    hint #34
 ; COMPAT-NEXT:    .cfi_b_key_frame
 ; COMPAT-NEXT:    hint #39
+; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:  .Ltmp11:
 ; COMPAT-NEXT:    hint #27
-; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:    adrp x16, .Ltmp11
 ; COMPAT-NEXT:    add x16, x16, :lo12:.Ltmp11
 ; COMPAT-NEXT:    hint #39
@@ -538,9 +538,9 @@ define i32 @leaf_sign_all_b_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r
 ; V83A-NEXT:    hint #34
 ; V83A-NEXT:    .cfi_b_key_frame
 ; V83A-NEXT:    hint #39
+; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:  .Ltmp11:
 ; V83A-NEXT:    pacibsp
-; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:    adrp x16, .Ltmp11
 ; V83A-NEXT:    add x16, x16, :lo12:.Ltmp11
 ; V83A-NEXT:    hint #39
@@ -550,9 +550,9 @@ define i32 @leaf_sign_all_b_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r
 ; PAUTHLR:       // %bb.0:
 ; PAUTHLR-NEXT:    bti c
 ; PAUTHLR-NEXT:    .cfi_b_key_frame
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp11:
 ; PAUTHLR-NEXT:    pacibsppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    adrp x16, .Ltmp11
 ; PAUTHLR-NEXT:    add x16, x16, :lo12:.Ltmp11
 ; PAUTHLR-NEXT:    retabsppc .Ltmp11
@@ -566,9 +566,9 @@ define i32 @leaf_sign_all_v83_b_key_bti(i32 %x) "branch-protection-pauth-lr" "si
 ; CHECK-NEXT:    hint #34
 ; CHECK-NEXT:    .cfi_b_key_frame
 ; CHECK-NEXT:    hint #39
+; CHECK-NEXT:    .cfi_negate_ra_state_with_pc
 ; CHECK-NEXT:  .Ltmp12:
 ; CHECK-NEXT:    pacibsp
-; CHECK-NEXT:    .cfi_negate_ra_state_with_pc
 ; CHECK-NEXT:    adrp x16, .Ltmp12
 ; CHECK-NEXT:    add x16, x16, :lo12:.Ltmp12
 ; CHECK-NEXT:    hint #39
@@ -578,9 +578,9 @@ define i32 @leaf_sign_all_v83_b_key_bti(i32 %x) "branch-protection-pauth-lr" "si
 ; PAUTHLR:       // %bb.0:
 ; PAUTHLR-NEXT:    bti c
 ; PAUTHLR-NEXT:    .cfi_b_key_frame
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp12:
 ; PAUTHLR-NEXT:    pacibsppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    adrp x16, .Ltmp12
 ; PAUTHLR-NEXT:    add x16, x16, :lo12:.Ltmp12
 ; PAUTHLR-NEXT:    retabsppc .Ltmp12
diff --git a/llvm/test/CodeGen/AArch64/sign-return-address.ll b/llvm/test/CodeGen/AArch64/sign-return-address.ll
index dafe0d7..e0ee0d8 100644
--- a/llvm/test/CodeGen/AArch64/sign-return-address.ll
+++ b/llvm/test/CodeGen/AArch64/sign-return-address.ll
@@ -29,15 +29,15 @@ define i32 @leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf"  {
 define i32 @leaf_sign_all(i32 %x) "sign-return-address"="all" {
 ; COMPAT-LABEL: leaf_sign_all:
 ; COMPAT:       // %bb.0:
-; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    .cfi_negate_ra_state
+; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    hint #29
 ; COMPAT-NEXT:    ret
 ;
 ; V83A-LABEL: leaf_sign_all:
 ; V83A:       // %bb.0:
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    retaa
   ret i32 %x
 }
@@ -45,8 +45,8 @@ define i32 @leaf_sign_all(i32 %x) "sign-return-address"="all" {
 define i64 @leaf_clobbers_lr(i64 %x) "sign-return-address"="non-leaf"  {
 ; COMPAT-LABEL: leaf_clobbers_lr:
 ; COMPAT:       // %bb.0:
-; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    .cfi_negate_ra_state
+; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; COMPAT-NEXT:    .cfi_def_cfa_offset 16
 ; COMPAT-NEXT:    .cfi_offset w30, -16
@@ -59,8 +59,8 @@ define i64 @leaf_clobbers_lr(i64 %x) "sign-return-address"="non-leaf"  {
 ;
 ; V83A-LABEL: leaf_clobbers_lr:
 ; V83A:       // %bb.0:
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -78,8 +78,8 @@ declare i32 @foo(i32)
 define i32 @non_leaf_sign_all(i32 %x) "sign-return-address"="all" {
 ; COMPAT-LABEL: non_leaf_sign_all:
 ; COMPAT:       // %bb.0:
-; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    .cfi_negate_ra_state
+; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; COMPAT-NEXT:    .cfi_def_cfa_offset 16
 ; COMPAT-NEXT:    .cfi_offset w30, -16
@@ -90,8 +90,8 @@ define i32 @non_leaf_sign_all(i32 %x) "sign-return-address"="all" {
 ;
 ; V83A-LABEL: non_leaf_sign_all:
 ; V83A:       // %bb.0:
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -105,8 +105,8 @@ define i32 @non_leaf_sign_all(i32 %x) "sign-return-address"="all" {
 define i32 @non_leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf"  {
 ; COMPAT-LABEL: non_leaf_sign_non_leaf:
 ; COMPAT:       // %bb.0:
-; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    .cfi_negate_ra_state
+; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; COMPAT-NEXT:    .cfi_def_cfa_offset 16
 ; COMPAT-NEXT:    .cfi_offset w30, -16
@@ -117,8 +117,8 @@ define i32 @non_leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf"  {
 ;
 ; V83A-LABEL: non_leaf_sign_non_leaf:
 ; V83A:       // %bb.0:
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -135,8 +135,8 @@ define i32 @non_leaf_scs(i32 %x) "sign-return-address"="non-leaf" shadowcallstac
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x30, [x18], #8
 ; CHECK-NEXT:    .cfi_escape 0x16, 0x12, 0x02, 0x82, 0x78 //
-; CHECK-NEXT:    paciasp
 ; CHECK-NEXT:    .cfi_negate_ra_state
+; CHECK-NEXT:    paciasp
 ; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -16
@@ -152,8 +152,8 @@ define i32 @non_leaf_scs(i32 %x) "sign-return-address"="non-leaf" shadowcallstac
 define i32 @leaf_sign_all_v83(i32 %x) "sign-return-address"="all" "target-features"="+v8.3a" {
 ; CHECK-LABEL: leaf_sign_all_v83:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    paciasp
 ; CHECK-NEXT:    .cfi_negate_ra_state
+; CHECK-NEXT:    paciasp
 ; CHECK-NEXT:    retaa
   ret i32 %x
 }
@@ -163,8 +163,8 @@ declare fastcc i64 @bar(i64)
 define fastcc void @spill_lr_and_tail_call(i64 %x) "sign-return-address"="all" {
 ; COMPAT-LABEL: spill_lr_and_tail_call:
 ; COMPAT:       // %bb.0:
-; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    .cfi_negate_ra_state
+; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; COMPAT-NEXT:    .cfi_def_cfa_offset 16
 ; COMPAT-NEXT:    .cfi_offset w30, -16
@@ -177,8 +177,8 @@ define fastcc void @spill_lr_and_tail_call(i64 %x) "sign-return-address"="all" {
 ;
 ; V83A-LABEL: spill_lr_and_tail_call:
 ; V83A:       // %bb.0:
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -196,15 +196,15 @@ define fastcc void @spill_lr_and_tail_call(i64 %x) "sign-return-address"="all" {
 define i32 @leaf_sign_all_a_key(i32 %x) "sign-return-address"="all" "sign-return-address-key"="a_key" {
 ; COMPAT-LABEL: leaf_sign_all_a_key:
 ; COMPAT:       // %bb.0:
-; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    .cfi_negate_ra_state
+; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    hint #29
 ; COMPAT-NEXT:    ret
 ;
 ; V83A-LABEL: leaf_sign_all_a_key:
 ; V83A:       // %bb.0:
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    retaa
   ret i32 %x
 }
@@ -213,16 +213,16 @@ define i32 @leaf_sign_all_b_key(i32 %x) "sign-return-address"="all" "sign-return
 ; COMPAT-LABEL: leaf_sign_all_b_key:
 ; COMPAT:       // %bb.0:
 ; COMPAT-NEXT:    .cfi_b_key_frame
-; COMPAT-NEXT:    hint #27
 ; COMPAT-NEXT:    .cfi_negate_ra_state
+; COMPAT-NEXT:    hint #27
 ; COMPAT-NEXT:    hint #31
 ; COMPAT-NEXT:    ret
 ;
 ; V83A-LABEL: leaf_sign_all_b_key:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    .cfi_b_key_frame
-; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    retab
   ret i32 %x
 }
@@ -231,8 +231,8 @@ define i32 @leaf_sign_all_v83_b_key(i32 %x) "sign-return-address"="all" "target-
 ; CHECK-LABEL: leaf_sign_all_v83_b_key:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    .cfi_b_key_frame
-; CHECK-NEXT:    pacibsp
 ; CHECK-NEXT:    .cfi_negate_ra_state
+; CHECK-NEXT:    pacibsp
 ; CHECK-NEXT:    retab
   ret i32 %x
 }
@@ -241,15 +241,15 @@ define i32 @leaf_sign_all_v83_b_key(i32 %x) "sign-return-address"="all" "target-
 define i32 @leaf_sign_all_a_key_bti(i32 %x) "sign-return-address"="all" "sign-return-address-key"="a_key" "branch-target-enforcement"{
 ; COMPAT-LABEL: leaf_sign_all_a_key_bti:
 ; COMPAT:       // %bb.0:
-; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    .cfi_negate_ra_state
+; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    hint #29
 ; COMPAT-NEXT:    ret
 ;
 ; V83A-LABEL: leaf_sign_all_a_key_bti:
 ; V83A:       // %bb.0:
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    retaa
   ret i32 %x
 }
@@ -259,16 +259,16 @@ define i32 @leaf_sign_all_b_key_bti(i32 %x) "sign-return-address"="all" "sign-re
 ; COMPAT-LABEL: leaf_sign_all_b_key_bti:
 ; COMPAT:       // %bb.0:
 ; COMPAT-NEXT:    .cfi_b_key_frame
-; COMPAT-NEXT:    hint #27
 ; COMPAT-NEXT:    .cfi_negate_ra_state
+; COMPAT-NEXT:    hint #27
 ; COMPAT-NEXT:    hint #31
 ; COMPAT-NEXT:    ret
 ;
 ; V83A-LABEL: leaf_sign_all_b_key_bti:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    .cfi_b_key_frame
-; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    retab
   ret i32 %x
 }
@@ -278,8 +278,8 @@ define i32 @leaf_sign_all_v83_b_key_bti(i32 %x) "sign-return-address"="all" "tar
 ; CHECK-LABEL: leaf_sign_all_v83_b_key_bti:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    .cfi_b_key_frame
-; CHECK-NEXT:    pacibsp
 ; CHECK-NEXT:    .cfi_negate_ra_state
+; CHECK-NEXT:    pacibsp
 ; CHECK-NEXT:    retab
   ret i32 %x
 }
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-state.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-state.ll
new file mode 100644
index 0000000..5037772
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-state.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
+
+
+define i1 @streaming_mode_streaming_compatible() #0 {
+; CHECK-LABEL: streaming_mode_streaming_compatible:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    bl __arm_sme_state
+; CHECK-NEXT:    and w0, w0, #0x1
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %mode = tail call noundef i1 @llvm.aarch64.sme.in.streaming.mode()
+  ret i1 %mode
+}
+
+
+attributes #0 = {nounwind memory(none) "aarch64_pstate_sm_compatible"}
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
index 66d6e03..66f83c6 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
@@ -316,6 +316,84 @@ entry:
   ret <vscale x 4 x i64> %partial.reduce
 }
 
+define <vscale x 4 x i32> @udot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a){
+; CHECK-LABEL: udot_no_bin_op:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-NEXT:    ret
+  %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
+  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
+  ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @sdot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a){
+; CHECK-LABEL: sdot_no_bin_op:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-NEXT:    ret
+  %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
+  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
+  ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 2 x i64> @udot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b){
+; CHECK-LABEL: udot_no_bin_op_wide:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
+  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide)
+  ret <vscale x 2 x i64> %partial.reduce
+}
+
+define <vscale x 2 x i64> @sdot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b){
+; CHECK-LABEL: sdot_no_bin_op_wide:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
+  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide)
+  ret <vscale x 2 x i64> %partial.reduce
+}
+
+define <vscale x 4 x i64> @udot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a){
+; CHECK-LABEL: udot_no_bin_op_8to64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.b, #1 // =0x1
+; CHECK-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-NEXT:    udot z4.s, z2.b, z3.b
+; CHECK-NEXT:    sunpklo z2.d, z4.s
+; CHECK-NEXT:    sunpkhi z3.d, z4.s
+; CHECK-NEXT:    add z0.d, z0.d, z2.d
+; CHECK-NEXT:    add z1.d, z1.d, z3.d
+; CHECK-NEXT:    ret
+  %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
+  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
+  ret <vscale x 4 x i64> %partial.reduce
+}
+
+define <vscale x 4 x i64> @sdot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a){
+; CHECK-LABEL: sdot_no_bin_op_8to64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.b, #1 // =0x1
+; CHECK-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-NEXT:    sdot z4.s, z2.b, z3.b
+; CHECK-NEXT:    sunpklo z2.d, z4.s
+; CHECK-NEXT:    sunpkhi z3.d, z4.s
+; CHECK-NEXT:    add z0.d, z0.d, z2.d
+; CHECK-NEXT:    add z1.d, z1.d, z3.d
+; CHECK-NEXT:    ret
+  %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
+  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
+  ret <vscale x 4 x i64> %partial.reduce
+}
+
 define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
 ; CHECK-LABEL: not_udot:
 ; CHECK:       // %bb.0: // %entry
@@ -419,3 +497,133 @@ entry:
   %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
   ret <vscale x 2 x i64> %partial.reduce
 }
+
+define <vscale x 2 x i64> @udot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
+; CHECK-LABEL: udot_different_types:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-NEXT:    uunpklo z3.s, z1.h
+; CHECK-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z4.s, z2.h
+; CHECK-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-NEXT:    uunpklo z5.d, z3.s
+; CHECK-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-NEXT:    uunpklo z7.d, z1.s
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    uunpklo z6.d, z4.s
+; CHECK-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-NEXT:    uunpklo z24.d, z2.s
+; CHECK-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-NEXT:    mul z3.d, z3.d, z4.d
+; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    movprfx z1, z3
+; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
+  %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
+  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
+  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
+  ret <vscale x 2 x i64> %partial.reduce
+}
+
+define <vscale x 2 x i64> @sdot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
+; CHECK-LABEL: sdot_different_types:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    sunpklo z3.s, z1.h
+; CHECK-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sunpklo z5.d, z3.s
+; CHECK-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-NEXT:    sunpklo z7.d, z1.s
+; CHECK-NEXT:    sunpklo z4.s, z2.h
+; CHECK-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-NEXT:    sunpklo z6.d, z4.s
+; CHECK-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-NEXT:    sunpklo z24.d, z2.s
+; CHECK-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-NEXT:    mul z3.d, z3.d, z4.d
+; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    movprfx z1, z3
+; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
+  %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i64>
+  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
+  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
+  ret <vscale x 2 x i64> %partial.reduce
+}
+
+define <vscale x 2 x i64> @usdot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
+; CHECK-LABEL: usdot_different_types:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uunpklo z3.s, z1.h
+; CHECK-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z5.d, z3.s
+; CHECK-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-NEXT:    uunpklo z7.d, z1.s
+; CHECK-NEXT:    sunpklo z4.s, z2.h
+; CHECK-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    sunpklo z6.d, z4.s
+; CHECK-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-NEXT:    sunpklo z24.d, z2.s
+; CHECK-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-NEXT:    mul z3.d, z3.d, z4.d
+; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    movprfx z1, z3
+; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
+  %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i64>
+  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
+  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
+  ret <vscale x 2 x i64> %partial.reduce
+}
+
+define <vscale x 2 x i64> @sudot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
+; CHECK-LABEL: sudot_different_types:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-NEXT:    sunpklo z3.s, z1.h
+; CHECK-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z4.s, z2.h
+; CHECK-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-NEXT:    sunpklo z5.d, z3.s
+; CHECK-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-NEXT:    sunpklo z7.d, z1.s
+; CHECK-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-NEXT:    uunpklo z6.d, z4.s
+; CHECK-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-NEXT:    uunpklo z24.d, z2.s
+; CHECK-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-NEXT:    mul z3.d, z3.d, z4.d
+; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    movprfx z1, z3
+; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
+  %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
+  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
+  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
+  ret <vscale x 2 x i64> %partial.reduce
+}
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index 7f2eefe..7f3c1fd 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -1,26 +1,86 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,SDAG
+; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -global-isel -global-isel-abort=2 -verify-machineinstrs < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL
 
 ; Basic tests from input vector to bitmask
 ; IR generated from clang for:
 ; __builtin_convertvector + reinterpret_cast<uint16&>
 
+; GISEL: warning: Instruction selection used fallback path for clang_builtins_undef_concat_convert_to_bitmask4
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_2xi32
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_8xi2
+; GISEL-NEXT: warning: Instruction selection used fallback path for no_direct_convert_for_bad_concat
+
 define i16 @convert_to_bitmask16(<16 x i8> %vec) {
 ; Bits used in mask
-; CHECK-LABEL: convert_to_bitmask16:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh0:
-; CHECK-NEXT:    adrp x8, lCPI0_0@PAGE
-; CHECK-NEXT:    cmeq.16b v0, v0, #0
-; CHECK-NEXT:  Lloh1:
-; CHECK-NEXT:    ldr q1, [x8, lCPI0_0@PAGEOFF]
-; CHECK-NEXT:    bic.16b v0, v1, v0
-; CHECK-NEXT:    ext.16b v1, v0, v0, #8
-; CHECK-NEXT:    zip1.16b v0, v0, v1
-; CHECK-NEXT:    addv.8h h0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh1
+; SDAG-LABEL: convert_to_bitmask16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    adrp x8, lCPI0_0@PAGE
+; SDAG-NEXT:    cmeq.16b v0, v0, #0
+; SDAG-NEXT:    ldr q1, [x8, lCPI0_0@PAGEOFF]
+; SDAG-NEXT:    bic.16b v0, v1, v0
+; SDAG-NEXT:    ext.16b v1, v0, v0, #8
+; SDAG-NEXT:    zip1.16b v0, v0, v1
+; SDAG-NEXT:    addv.8h h0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    cmeq.16b v0, v0, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    umov.b w8, v0[1]
+; GISEL-NEXT:    umov.b w9, v0[0]
+; GISEL-NEXT:    umov.b w10, v0[2]
+; GISEL-NEXT:    umov.b w11, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    orr w8, w9, w8, lsl #2
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[5]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[6]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #4
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #5
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[8]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #6
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[9]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #7
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[10]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #8
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[11]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #9
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[12]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #10
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[13]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #11
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[14]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #12
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[15]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #13
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #14
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #15
+; GISEL-NEXT:    strh w8, [sp, #14]
+; GISEL-NEXT:    and w0, w8, #0xffff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 ; Actual conversion
 
@@ -30,19 +90,50 @@ define i16 @convert_to_bitmask16(<16 x i8> %vec) {
 }
 
 define i16 @convert_to_bitmask8(<8 x i16> %vec) {
-; CHECK-LABEL: convert_to_bitmask8:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh2:
-; CHECK-NEXT:    adrp x8, lCPI1_0@PAGE
-; CHECK-NEXT:    cmeq.8h v0, v0, #0
-; CHECK-NEXT:  Lloh3:
-; CHECK-NEXT:    ldr q1, [x8, lCPI1_0@PAGEOFF]
-; CHECK-NEXT:    bic.16b v0, v1, v0
-; CHECK-NEXT:    addv.8h h0, v0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    and w0, w8, #0xff
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh3
+; SDAG-LABEL: convert_to_bitmask8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    adrp x8, lCPI1_0@PAGE
+; SDAG-NEXT:    cmeq.8h v0, v0, #0
+; SDAG-NEXT:    ldr q1, [x8, lCPI1_0@PAGEOFF]
+; SDAG-NEXT:    bic.16b v0, v1, v0
+; SDAG-NEXT:    addv.8h h0, v0
+; SDAG-NEXT:    fmov w8, s0
+; SDAG-NEXT:    and w0, w8, #0xff
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    cmeq.8h v0, v0, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    xtn.8b v0, v0
+; GISEL-NEXT:    umov.b w8, v0[1]
+; GISEL-NEXT:    umov.b w9, v0[0]
+; GISEL-NEXT:    umov.b w10, v0[2]
+; GISEL-NEXT:    umov.b w11, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    orr w8, w9, w8, lsl #2
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[5]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[6]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #4
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #5
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #6
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #7
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
   %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer
@@ -52,18 +143,36 @@ define i16 @convert_to_bitmask8(<8 x i16> %vec) {
 }
 
 define i4 @convert_to_bitmask4(<4 x i32> %vec) {
-; CHECK-LABEL: convert_to_bitmask4:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh4:
-; CHECK-NEXT:    adrp x8, lCPI2_0@PAGE
-; CHECK-NEXT:    cmeq.4s v0, v0, #0
-; CHECK-NEXT:  Lloh5:
-; CHECK-NEXT:    ldr q1, [x8, lCPI2_0@PAGEOFF]
-; CHECK-NEXT:    bic.16b v0, v1, v0
-; CHECK-NEXT:    addv.4s s0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh4, Lloh5
+; SDAG-LABEL: convert_to_bitmask4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    adrp x8, lCPI2_0@PAGE
+; SDAG-NEXT:    cmeq.4s v0, v0, #0
+; SDAG-NEXT:    ldr q1, [x8, lCPI2_0@PAGEOFF]
+; SDAG-NEXT:    bic.16b v0, v1, v0
+; SDAG-NEXT:    addv.4s s0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    cmeq.4s v0, v0, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    mov.s w8, v0[1]
+; GISEL-NEXT:    mov.s w9, v0[2]
+; GISEL-NEXT:    fmov w11, s0
+; GISEL-NEXT:    mov.s w10, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w11, w8, #1, #31
+; GISEL-NEXT:    and w8, w9, #0x1
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w11, w8, lsl #2
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
   %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
@@ -74,17 +183,14 @@ define i4 @convert_to_bitmask4(<4 x i32> %vec) {
 define i8 @convert_to_bitmask2(<2 x i64> %vec) {
 ; CHECK-LABEL: convert_to_bitmask2:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh6:
 ; CHECK-NEXT:    adrp x8, lCPI3_0@PAGE
 ; CHECK-NEXT:    cmeq.2d v0, v0, #0
-; CHECK-NEXT:  Lloh7:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI3_0@PAGEOFF]
 ; CHECK-NEXT:    bic.16b v0, v1, v0
 ; CHECK-NEXT:    addp.2d d0, v0
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x3
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh6, Lloh7
 
 
   %cmp_result = icmp ne <2 x i64> %vec, zeroinitializer
@@ -97,16 +203,13 @@ define i8 @convert_to_bitmask2(<2 x i64> %vec) {
 define i8 @clang_builtins_undef_concat_convert_to_bitmask4(<4 x i32> %vec) {
 ; CHECK-LABEL: clang_builtins_undef_concat_convert_to_bitmask4:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh8:
 ; CHECK-NEXT:    adrp x8, lCPI4_0@PAGE
 ; CHECK-NEXT:    cmeq.4s v0, v0, #0
-; CHECK-NEXT:  Lloh9:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI4_0@PAGEOFF]
 ; CHECK-NEXT:    bic.16b v0, v1, v0
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh8, Lloh9
 
 
   %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
@@ -117,20 +220,37 @@ define i8 @clang_builtins_undef_concat_convert_to_bitmask4(<4 x i32> %vec) {
 
 
 define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) {
-; CHECK-LABEL: convert_to_bitmask_no_compare:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:  Lloh10:
-; CHECK-NEXT:    adrp x8, lCPI5_0@PAGE
-; CHECK-NEXT:  Lloh11:
-; CHECK-NEXT:    ldr q1, [x8, lCPI5_0@PAGEOFF]
-; CHECK-NEXT:    shl.4s v0, v0, #31
-; CHECK-NEXT:    cmlt.4s v0, v0, #0
-; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:    addv.4s s0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh10, Lloh11
+; SDAG-LABEL: convert_to_bitmask_no_compare:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    and.16b v0, v0, v1
+; SDAG-NEXT:    adrp x8, lCPI5_0@PAGE
+; SDAG-NEXT:    ldr q1, [x8, lCPI5_0@PAGEOFF]
+; SDAG-NEXT:    shl.4s v0, v0, #31
+; SDAG-NEXT:    cmlt.4s v0, v0, #0
+; SDAG-NEXT:    and.16b v0, v0, v1
+; SDAG-NEXT:    addv.4s s0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask_no_compare:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    and.16b v0, v0, v1
+; GISEL-NEXT:    mov.s w8, v0[1]
+; GISEL-NEXT:    mov.s w9, v0[2]
+; GISEL-NEXT:    fmov w11, s0
+; GISEL-NEXT:    mov.s w10, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w11, w8, #1, #31
+; GISEL-NEXT:    and w8, w9, #0x1
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w11, w8, lsl #2
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
   %cmp = and <4 x i32> %vec1, %vec2
@@ -140,20 +260,39 @@ define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) {
 }
 
 define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec2) {
-; CHECK-LABEL: convert_to_bitmask_with_compare_chain:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    cmeq.4s v2, v0, #0
-; CHECK-NEXT:    cmeq.4s v0, v0, v1
-; CHECK-NEXT:  Lloh12:
-; CHECK-NEXT:    adrp x8, lCPI6_0@PAGE
-; CHECK-NEXT:  Lloh13:
-; CHECK-NEXT:    ldr q1, [x8, lCPI6_0@PAGEOFF]
-; CHECK-NEXT:    bic.16b v0, v0, v2
-; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:    addv.4s s0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh12, Lloh13
+; SDAG-LABEL: convert_to_bitmask_with_compare_chain:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    cmeq.4s v2, v0, #0
+; SDAG-NEXT:    cmeq.4s v0, v0, v1
+; SDAG-NEXT:    adrp x8, lCPI6_0@PAGE
+; SDAG-NEXT:    ldr q1, [x8, lCPI6_0@PAGEOFF]
+; SDAG-NEXT:    bic.16b v0, v0, v2
+; SDAG-NEXT:    and.16b v0, v0, v1
+; SDAG-NEXT:    addv.4s s0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask_with_compare_chain:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    cmeq.4s v2, v0, #0
+; GISEL-NEXT:    cmeq.4s v0, v0, v1
+; GISEL-NEXT:    bic.16b v0, v0, v2
+; GISEL-NEXT:    mov.s w8, v0[1]
+; GISEL-NEXT:    mov.s w9, v0[2]
+; GISEL-NEXT:    fmov w11, s0
+; GISEL-NEXT:    mov.s w10, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w11, w8, #1, #31
+; GISEL-NEXT:    and w8, w9, #0x1
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w11, w8, lsl #2
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
   %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
@@ -164,21 +303,39 @@ define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec
 }
 
 define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %vec2) {
-; CHECK-LABEL: convert_to_bitmask_with_trunc_in_chain:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    cmeq.4s v0, v0, #0
-; CHECK-NEXT:  Lloh14:
-; CHECK-NEXT:    adrp x8, lCPI7_0@PAGE
-; CHECK-NEXT:    bic.16b v0, v1, v0
-; CHECK-NEXT:  Lloh15:
-; CHECK-NEXT:    ldr q1, [x8, lCPI7_0@PAGEOFF]
-; CHECK-NEXT:    shl.4s v0, v0, #31
-; CHECK-NEXT:    cmlt.4s v0, v0, #0
-; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:    addv.4s s0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh14, Lloh15
+; SDAG-LABEL: convert_to_bitmask_with_trunc_in_chain:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    cmeq.4s v0, v0, #0
+; SDAG-NEXT:    adrp x8, lCPI7_0@PAGE
+; SDAG-NEXT:    bic.16b v0, v1, v0
+; SDAG-NEXT:    ldr q1, [x8, lCPI7_0@PAGEOFF]
+; SDAG-NEXT:    shl.4s v0, v0, #31
+; SDAG-NEXT:    cmlt.4s v0, v0, #0
+; SDAG-NEXT:    and.16b v0, v0, v1
+; SDAG-NEXT:    addv.4s s0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask_with_trunc_in_chain:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    cmeq.4s v0, v0, #0
+; GISEL-NEXT:    bic.16b v0, v1, v0
+; GISEL-NEXT:    mov.s w8, v0[1]
+; GISEL-NEXT:    mov.s w9, v0[2]
+; GISEL-NEXT:    fmov w11, s0
+; GISEL-NEXT:    mov.s w10, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w11, w8, #1, #31
+; GISEL-NEXT:    and w8, w9, #0x1
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w11, w8, lsl #2
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
   %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
@@ -189,33 +346,82 @@ define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %ve
 }
 
 define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <4 x i32> %vec2) {
-; CHECK-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    cmeq.4s v0, v0, #0
-; CHECK-NEXT:    cmeq.4s v1, v1, #0
-; CHECK-NEXT:  Lloh16:
-; CHECK-NEXT:    adrp x8, lCPI8_0@PAGE
-; CHECK-NEXT:    movi d2, #0x000000ffffffff
-; CHECK-NEXT:    movi d3, #0x00ffffffffffff
-; CHECK-NEXT:    bic.16b v0, v1, v0
-; CHECK-NEXT:    movi d1, #0xffff0000ffff0000
-; CHECK-NEXT:    xtn.4h v0, v0
-; CHECK-NEXT:    orr.8b v0, v0, v2
-; CHECK-NEXT:    movi d2, #0x00ffffffff0000
-; CHECK-NEXT:    eor.8b v1, v0, v1
-; CHECK-NEXT:    eor.8b v0, v0, v2
-; CHECK-NEXT:    mov.h v1[2], wzr
-; CHECK-NEXT:    orr.8b v0, v0, v3
-; CHECK-NEXT:    orr.8b v0, v1, v0
-; CHECK-NEXT:  Lloh17:
-; CHECK-NEXT:    ldr d1, [x8, lCPI8_0@PAGEOFF]
-; CHECK-NEXT:    shl.4h v0, v0, #15
-; CHECK-NEXT:    cmlt.4h v0, v0, #0
-; CHECK-NEXT:    and.8b v0, v0, v1
-; CHECK-NEXT:    addv.4h h0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh16, Lloh17
+; SDAG-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    cmeq.4s v0, v0, #0
+; SDAG-NEXT:    cmeq.4s v1, v1, #0
+; SDAG-NEXT:    adrp x8, lCPI8_0@PAGE
+; SDAG-NEXT:    movi d2, #0x000000ffffffff
+; SDAG-NEXT:    movi d3, #0x00ffffffffffff
+; SDAG-NEXT:    bic.16b v0, v1, v0
+; SDAG-NEXT:    movi d1, #0xffff0000ffff0000
+; SDAG-NEXT:    xtn.4h v0, v0
+; SDAG-NEXT:    orr.8b v0, v0, v2
+; SDAG-NEXT:    movi d2, #0x00ffffffff0000
+; SDAG-NEXT:    eor.8b v1, v0, v1
+; SDAG-NEXT:    eor.8b v0, v0, v2
+; SDAG-NEXT:    mov.h v1[2], wzr
+; SDAG-NEXT:    orr.8b v0, v0, v3
+; SDAG-NEXT:    orr.8b v0, v1, v0
+; SDAG-NEXT:    ldr d1, [x8, lCPI8_0@PAGEOFF]
+; SDAG-NEXT:    shl.4h v0, v0, #15
+; SDAG-NEXT:    cmlt.4h v0, v0, #0
+; SDAG-NEXT:    and.8b v0, v0, v1
+; SDAG-NEXT:    addv.4h h0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    mov w8, #1 ; =0x1
+; GISEL-NEXT:    mov w9, #0 ; =0x0
+; GISEL-NEXT:    cmeq.4s v5, v0, #0
+; GISEL-NEXT:    fmov s2, w8
+; GISEL-NEXT:    fmov s4, w9
+; GISEL-NEXT:    cmeq.4s v1, v1, #0
+; GISEL-NEXT:    mov.16b v3, v2
+; GISEL-NEXT:    mov.16b v0, v4
+; GISEL-NEXT:    mov.h v4[1], w8
+; GISEL-NEXT:    bic.16b v1, v1, v5
+; GISEL-NEXT:    mov.16b v5, v2
+; GISEL-NEXT:    mov.h v2[1], w8
+; GISEL-NEXT:    mov.h v3[1], w8
+; GISEL-NEXT:    mov.h v0[1], w8
+; GISEL-NEXT:    mov.h v5[1], w8
+; GISEL-NEXT:    mov.h v4[2], w8
+; GISEL-NEXT:    xtn.4h v1, v1
+; GISEL-NEXT:    mov.h v2[2], w8
+; GISEL-NEXT:    mov.h v3[2], w9
+; GISEL-NEXT:    mov.h v0[2], w9
+; GISEL-NEXT:    mov.h v5[2], w9
+; GISEL-NEXT:    mov.h v4[3], w9
+; GISEL-NEXT:    mov.h v2[3], w9
+; GISEL-NEXT:    mov.h v3[3], w9
+; GISEL-NEXT:    mov.h v0[3], w8
+; GISEL-NEXT:    mov.h v5[3], w8
+; GISEL-NEXT:    orr.8b v1, v1, v3
+; GISEL-NEXT:    eor.8b v0, v1, v0
+; GISEL-NEXT:    eor.8b v1, v4, v1
+; GISEL-NEXT:    and.8b v0, v0, v5
+; GISEL-NEXT:    orr.8b v1, v2, v1
+; GISEL-NEXT:    orr.8b v0, v0, v1
+; GISEL-NEXT:    ushll.4s v0, v0, #0
+; GISEL-NEXT:    mov.s w8, v0[1]
+; GISEL-NEXT:    mov.s w9, v0[2]
+; GISEL-NEXT:    fmov w11, s0
+; GISEL-NEXT:    mov.s w10, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w11, w8, #1, #31
+; GISEL-NEXT:    and w8, w9, #0x1
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w11, w8, lsl #2
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
   %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
@@ -234,21 +440,42 @@ define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <
 }
 
 define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4 x i32> %vec2) {
-; CHECK-LABEL: convert_to_bitmask_with_different_types_in_chain:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    cmeq.4s v1, v1, #0
-; CHECK-NEXT:    cmeq.4h v0, v0, #0
-; CHECK-NEXT:  Lloh18:
-; CHECK-NEXT:    adrp x8, lCPI9_0@PAGE
-; CHECK-NEXT:    xtn.4h v1, v1
-; CHECK-NEXT:    orn.8b v0, v1, v0
-; CHECK-NEXT:  Lloh19:
-; CHECK-NEXT:    ldr d1, [x8, lCPI9_0@PAGEOFF]
-; CHECK-NEXT:    and.8b v0, v0, v1
-; CHECK-NEXT:    addv.4h h0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh18, Lloh19
+; SDAG-LABEL: convert_to_bitmask_with_different_types_in_chain:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    cmeq.4s v1, v1, #0
+; SDAG-NEXT:    cmeq.4h v0, v0, #0
+; SDAG-NEXT:    adrp x8, lCPI9_0@PAGE
+; SDAG-NEXT:    xtn.4h v1, v1
+; SDAG-NEXT:    orn.8b v0, v1, v0
+; SDAG-NEXT:    ldr d1, [x8, lCPI9_0@PAGEOFF]
+; SDAG-NEXT:    and.8b v0, v0, v1
+; SDAG-NEXT:    addv.4h h0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask_with_different_types_in_chain:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    cmeq.4s v1, v1, #0
+; GISEL-NEXT:    cmeq.4h v0, v0, #0
+; GISEL-NEXT:    xtn.4h v1, v1
+; GISEL-NEXT:    orn.8b v0, v1, v0
+; GISEL-NEXT:    ushll.4s v0, v0, #0
+; GISEL-NEXT:    mov.s w8, v0[1]
+; GISEL-NEXT:    mov.s w9, v0[2]
+; GISEL-NEXT:    fmov w11, s0
+; GISEL-NEXT:    mov.s w10, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w11, w8, #1, #31
+; GISEL-NEXT:    and w8, w9, #0x1
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w11, w8, lsl #2
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
   %cmp1 = icmp ne <4 x i16> %vec1, zeroinitializer
@@ -259,21 +486,73 @@ define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4
 }
 
 define i16 @convert_to_bitmask_without_knowing_type(<16 x i1> %vec) {
-; CHECK-LABEL: convert_to_bitmask_without_knowing_type:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    shl.16b v0, v0, #7
-; CHECK-NEXT:  Lloh20:
-; CHECK-NEXT:    adrp x8, lCPI10_0@PAGE
-; CHECK-NEXT:  Lloh21:
-; CHECK-NEXT:    ldr q1, [x8, lCPI10_0@PAGEOFF]
-; CHECK-NEXT:    cmlt.16b v0, v0, #0
-; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:    ext.16b v1, v0, v0, #8
-; CHECK-NEXT:    zip1.16b v0, v0, v1
-; CHECK-NEXT:    addv.8h h0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh20, Lloh21
+; SDAG-LABEL: convert_to_bitmask_without_knowing_type:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    shl.16b v0, v0, #7
+; SDAG-NEXT:    adrp x8, lCPI10_0@PAGE
+; SDAG-NEXT:    ldr q1, [x8, lCPI10_0@PAGEOFF]
+; SDAG-NEXT:    cmlt.16b v0, v0, #0
+; SDAG-NEXT:    and.16b v0, v0, v1
+; SDAG-NEXT:    ext.16b v1, v0, v0, #8
+; SDAG-NEXT:    zip1.16b v0, v0, v1
+; SDAG-NEXT:    addv.8h h0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask_without_knowing_type:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    umov.b w8, v0[1]
+; GISEL-NEXT:    umov.b w9, v0[0]
+; GISEL-NEXT:    umov.b w10, v0[2]
+; GISEL-NEXT:    umov.b w11, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    orr w8, w9, w8, lsl #2
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[5]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[6]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #4
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #5
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[8]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #6
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[9]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #7
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[10]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #8
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[11]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #9
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[12]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #10
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[13]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #11
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[14]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #12
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[15]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #13
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #14
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #15
+; GISEL-NEXT:    strh w8, [sp, #14]
+; GISEL-NEXT:    and w0, w8, #0xffff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
   %bitmask = bitcast <16 x i1> %vec to i16
   ret i16 %bitmask
@@ -282,16 +561,13 @@ define i16 @convert_to_bitmask_without_knowing_type(<16 x i1> %vec) {
 define i2 @convert_to_bitmask_2xi32(<2 x i32> %vec) {
 ; CHECK-LABEL: convert_to_bitmask_2xi32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh22:
 ; CHECK-NEXT:    adrp x8, lCPI11_0@PAGE
 ; CHECK-NEXT:    cmeq.2s v0, v0, #0
-; CHECK-NEXT:  Lloh23:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI11_0@PAGEOFF]
 ; CHECK-NEXT:    bic.8b v0, v1, v0
 ; CHECK-NEXT:    addp.2s v0, v0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh22, Lloh23
 
   %cmp_result = icmp ne <2 x i32> %vec, zeroinitializer
   %bitmask = bitcast <2 x i1> %cmp_result to i2
@@ -299,19 +575,51 @@ define i2 @convert_to_bitmask_2xi32(<2 x i32> %vec) {
 }
 
 define i4 @convert_to_bitmask_4xi8(<4 x i8> %vec) {
-; CHECK-LABEL: convert_to_bitmask_4xi8:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    bic.4h v0, #255, lsl #8
-; CHECK-NEXT:  Lloh24:
-; CHECK-NEXT:    adrp x8, lCPI12_0@PAGE
-; CHECK-NEXT:  Lloh25:
-; CHECK-NEXT:    ldr d1, [x8, lCPI12_0@PAGEOFF]
-; CHECK-NEXT:    cmeq.4h v0, v0, #0
-; CHECK-NEXT:    bic.8b v0, v1, v0
-; CHECK-NEXT:    addv.4h h0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh24, Lloh25
+; SDAG-LABEL: convert_to_bitmask_4xi8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    bic.4h v0, #255, lsl #8
+; SDAG-NEXT:    adrp x8, lCPI12_0@PAGE
+; SDAG-NEXT:    ldr d1, [x8, lCPI12_0@PAGEOFF]
+; SDAG-NEXT:    cmeq.4h v0, v0, #0
+; SDAG-NEXT:    bic.8b v0, v1, v0
+; SDAG-NEXT:    addv.4h h0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask_4xi8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    mov w8, #0 ; =0x0
+; GISEL-NEXT:    uzp1.8b v0, v0, v0
+; GISEL-NEXT:    fmov s1, w8
+; GISEL-NEXT:    mov.b v1[1], w8
+; GISEL-NEXT:    mov.b v1[2], w8
+; GISEL-NEXT:    mov.b v1[3], w8
+; GISEL-NEXT:    cmeq.8b v0, v0, v1
+; GISEL-NEXT:    mvn.8b v0, v0
+; GISEL-NEXT:    umov.b w8, v0[0]
+; GISEL-NEXT:    umov.b w9, v0[1]
+; GISEL-NEXT:    mov.s v1[0], w8
+; GISEL-NEXT:    umov.b w8, v0[2]
+; GISEL-NEXT:    mov.s v1[1], w9
+; GISEL-NEXT:    umov.b w9, v0[3]
+; GISEL-NEXT:    mov.s v1[2], w8
+; GISEL-NEXT:    mov.s v1[3], w9
+; GISEL-NEXT:    mov.s w8, v1[1]
+; GISEL-NEXT:    mov.s w9, v1[2]
+; GISEL-NEXT:    fmov w11, s1
+; GISEL-NEXT:    mov.s w10, v1[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w11, w8, #1, #31
+; GISEL-NEXT:    and w8, w9, #0x1
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w11, w8, lsl #2
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
   %cmp_result = icmp ne <4 x i8> %vec, zeroinitializer
   %bitmask = bitcast <4 x i1> %cmp_result to i4
@@ -322,17 +630,14 @@ define i8 @convert_to_bitmask_8xi2(<8 x i2> %vec) {
 ; CHECK-LABEL: convert_to_bitmask_8xi2:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    movi.8b v1, #3
-; CHECK-NEXT:  Lloh26:
 ; CHECK-NEXT:    adrp x8, lCPI13_0@PAGE
 ; CHECK-NEXT:    and.8b v0, v0, v1
-; CHECK-NEXT:  Lloh27:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI13_0@PAGEOFF]
 ; CHECK-NEXT:    cmeq.8b v0, v0, #0
 ; CHECK-NEXT:    bic.8b v0, v1, v0
 ; CHECK-NEXT:    addv.8b b0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh26, Lloh27
 
   %cmp_result = icmp ne <8 x i2> %vec, zeroinitializer
   %bitmask = bitcast <8 x i1> %cmp_result to i8
@@ -340,20 +645,39 @@ define i8 @convert_to_bitmask_8xi2(<8 x i2> %vec) {
 }
 
 define i4 @convert_to_bitmask_float(<4 x float> %vec) {
-; CHECK-LABEL: convert_to_bitmask_float:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    fcmgt.4s v1, v0, #0.0
-; CHECK-NEXT:    fcmlt.4s v0, v0, #0.0
-; CHECK-NEXT:  Lloh28:
-; CHECK-NEXT:    adrp x8, lCPI14_0@PAGE
-; CHECK-NEXT:    orr.16b v0, v0, v1
-; CHECK-NEXT:  Lloh29:
-; CHECK-NEXT:    ldr q1, [x8, lCPI14_0@PAGEOFF]
-; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:    addv.4s s0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh28, Lloh29
+; SDAG-LABEL: convert_to_bitmask_float:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    fcmgt.4s v1, v0, #0.0
+; SDAG-NEXT:    fcmlt.4s v0, v0, #0.0
+; SDAG-NEXT:    adrp x8, lCPI14_0@PAGE
+; SDAG-NEXT:    orr.16b v0, v0, v1
+; SDAG-NEXT:    ldr q1, [x8, lCPI14_0@PAGEOFF]
+; SDAG-NEXT:    and.16b v0, v0, v1
+; SDAG-NEXT:    addv.4s s0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask_float:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    fcmgt.4s v1, v0, #0.0
+; GISEL-NEXT:    fcmlt.4s v0, v0, #0.0
+; GISEL-NEXT:    orr.16b v0, v0, v1
+; GISEL-NEXT:    mov.s w8, v0[1]
+; GISEL-NEXT:    mov.s w9, v0[2]
+; GISEL-NEXT:    fmov w11, s0
+; GISEL-NEXT:    mov.s w10, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w11, w8, #1, #31
+; GISEL-NEXT:    and w8, w9, #0x1
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w11, w8, lsl #2
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
   %cmp_result = fcmp one <4 x float> %vec, zeroinitializer
@@ -364,24 +688,58 @@ define i4 @convert_to_bitmask_float(<4 x float> %vec) {
 ; Larger vector types don't map directly, but the can be split/truncated and then converted.
 ; After the comparison against 0, this is truncated to <8 x i16>, which is valid again.
 define i8 @convert_large_vector(<8 x i32> %vec) {
-; CHECK-LABEL: convert_large_vector:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    cmeq.4s v1, v1, #0
-; CHECK-NEXT:    cmeq.4s v0, v0, #0
-; CHECK-NEXT:  Lloh30:
-; CHECK-NEXT:    adrp x8, lCPI15_0@PAGE
-; CHECK-NEXT:    uzp1.8h v0, v0, v1
-; CHECK-NEXT:  Lloh31:
-; CHECK-NEXT:    ldr q1, [x8, lCPI15_0@PAGEOFF]
-; CHECK-NEXT:    bic.16b v0, v1, v0
-; CHECK-NEXT:    addv.8h h0, v0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    and w0, w8, #0xff
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh30, Lloh31
+; SDAG-LABEL: convert_large_vector:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    sub sp, sp, #16
+; SDAG-NEXT:    .cfi_def_cfa_offset 16
+; SDAG-NEXT:    cmeq.4s v1, v1, #0
+; SDAG-NEXT:    cmeq.4s v0, v0, #0
+; SDAG-NEXT:    adrp x8, lCPI15_0@PAGE
+; SDAG-NEXT:    uzp1.8h v0, v0, v1
+; SDAG-NEXT:    ldr q1, [x8, lCPI15_0@PAGEOFF]
+; SDAG-NEXT:    bic.16b v0, v1, v0
+; SDAG-NEXT:    addv.8h h0, v0
+; SDAG-NEXT:    fmov w8, s0
+; SDAG-NEXT:    and w0, w8, #0xff
+; SDAG-NEXT:    add sp, sp, #16
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_large_vector:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    cmeq.4s v0, v0, #0
+; GISEL-NEXT:    cmeq.4s v1, v1, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    mvn.16b v1, v1
+; GISEL-NEXT:    uzp1.8h v0, v0, v1
+; GISEL-NEXT:    xtn.8b v0, v0
+; GISEL-NEXT:    umov.b w8, v0[1]
+; GISEL-NEXT:    umov.b w9, v0[0]
+; GISEL-NEXT:    umov.b w10, v0[2]
+; GISEL-NEXT:    umov.b w11, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    orr w8, w9, w8, lsl #2
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[5]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[6]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #4
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #5
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #6
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #7
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
    %cmp_result = icmp ne <8 x i32> %vec, zeroinitializer
@@ -390,20 +748,40 @@ define i8 @convert_large_vector(<8 x i32> %vec) {
 }
 
 define i4 @convert_legalized_illegal_element_size(<4 x i22> %vec) {
-; CHECK-LABEL: convert_legalized_illegal_element_size:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    movi.4s v1, #63, msl #16
-; CHECK-NEXT:  Lloh32:
-; CHECK-NEXT:    adrp x8, lCPI16_0@PAGE
-; CHECK-NEXT:    cmtst.4s v0, v0, v1
-; CHECK-NEXT:  Lloh33:
-; CHECK-NEXT:    ldr d1, [x8, lCPI16_0@PAGEOFF]
-; CHECK-NEXT:    xtn.4h v0, v0
-; CHECK-NEXT:    and.8b v0, v0, v1
-; CHECK-NEXT:    addv.4h h0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh32, Lloh33
+; SDAG-LABEL: convert_legalized_illegal_element_size:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    movi.4s v1, #63, msl #16
+; SDAG-NEXT:    adrp x8, lCPI16_0@PAGE
+; SDAG-NEXT:    cmtst.4s v0, v0, v1
+; SDAG-NEXT:    ldr d1, [x8, lCPI16_0@PAGEOFF]
+; SDAG-NEXT:    xtn.4h v0, v0
+; SDAG-NEXT:    and.8b v0, v0, v1
+; SDAG-NEXT:    addv.4h h0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_legalized_illegal_element_size:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    movi.4s v1, #63, msl #16
+; GISEL-NEXT:    and.16b v0, v0, v1
+; GISEL-NEXT:    cmeq.4s v0, v0, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    mov.s w8, v0[1]
+; GISEL-NEXT:    mov.s w9, v0[2]
+; GISEL-NEXT:    fmov w11, s0
+; GISEL-NEXT:    mov.s w10, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w11, w8, #1, #31
+; GISEL-NEXT:    and w8, w9, #0x1
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w11, w8, lsl #2
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
   %cmp_result = icmp ne <4 x i22> %vec, zeroinitializer
   %bitmask = bitcast <4 x i1> %cmp_result to i4
@@ -415,7 +793,6 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) {
 ; CHECK-LABEL: no_direct_convert_for_bad_concat:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    cmtst.4s v0, v0, v0
-; CHECK-NEXT:  Lloh34:
 ; CHECK-NEXT:    adrp x8, lCPI17_0@PAGE
 ; CHECK-NEXT:    xtn.4h v0, v0
 ; CHECK-NEXT:    umov.h w9, v0[0]
@@ -427,14 +804,12 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) {
 ; CHECK-NEXT:    umov.h w9, v0[3]
 ; CHECK-NEXT:    mov.b v1[7], w9
 ; CHECK-NEXT:    shl.8b v0, v1, #7
-; CHECK-NEXT:  Lloh35:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI17_0@PAGEOFF]
 ; CHECK-NEXT:    cmlt.8b v0, v0, #0
 ; CHECK-NEXT:    and.8b v0, v0, v1
 ; CHECK-NEXT:    addv.8b b0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh34, Lloh35
 
   %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
   %vector_pad = shufflevector <4 x i1> poison, <4 x i1> %cmp_result, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
@@ -443,47 +818,101 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) {
 }
 
 define <8 x i1> @no_convert_without_direct_bitcast(<8 x i16> %vec) {
-; CHECK-LABEL: no_convert_without_direct_bitcast:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    cmtst.8h v0, v0, v0
-; CHECK-NEXT:    xtn.8b v0, v0
-; CHECK-NEXT:    ret
+; SDAG-LABEL: no_convert_without_direct_bitcast:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    cmtst.8h v0, v0, v0
+; SDAG-NEXT:    xtn.8b v0, v0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: no_convert_without_direct_bitcast:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmeq.8h v0, v0, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    xtn.8b v0, v0
+; GISEL-NEXT:    ret
 
    %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer
    ret <8 x i1> %cmp_result
 }
 
 define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) {
-; CHECK-LABEL: no_combine_illegal_num_elements:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    fmov s1, w4
-; CHECK-NEXT:    mov.s v0[1], w1
-; CHECK-NEXT:    mov.s v1[1], w5
-; CHECK-NEXT:    mov.s v0[2], w2
-; CHECK-NEXT:    cmeq.4s v1, v1, #0
-; CHECK-NEXT:    mov.s v0[3], w3
-; CHECK-NEXT:    cmeq.4s v0, v0, #0
-; CHECK-NEXT:    uzp1.8h v0, v0, v1
-; CHECK-NEXT:    mvn.16b v0, v0
-; CHECK-NEXT:    xtn.8b v0, v0
-; CHECK-NEXT:    umov.b w8, v0[0]
-; CHECK-NEXT:    umov.b w9, v0[1]
-; CHECK-NEXT:    umov.b w10, v0[2]
-; CHECK-NEXT:    and w8, w8, #0x1
-; CHECK-NEXT:    bfi w8, w9, #1, #1
-; CHECK-NEXT:    umov.b w9, v0[3]
-; CHECK-NEXT:    bfi w8, w10, #2, #1
-; CHECK-NEXT:    umov.b w10, v0[4]
-; CHECK-NEXT:    bfi w8, w9, #3, #1
-; CHECK-NEXT:    umov.b w9, v0[5]
-; CHECK-NEXT:    bfi w8, w10, #4, #1
-; CHECK-NEXT:    orr w8, w8, w9, lsl #5
-; CHECK-NEXT:    and w0, w8, #0x3f
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
+; SDAG-LABEL: no_combine_illegal_num_elements:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    sub sp, sp, #16
+; SDAG-NEXT:    .cfi_def_cfa_offset 16
+; SDAG-NEXT:    fmov s0, w0
+; SDAG-NEXT:    fmov s1, w4
+; SDAG-NEXT:    mov.s v0[1], w1
+; SDAG-NEXT:    mov.s v1[1], w5
+; SDAG-NEXT:    mov.s v0[2], w2
+; SDAG-NEXT:    cmeq.4s v1, v1, #0
+; SDAG-NEXT:    mov.s v0[3], w3
+; SDAG-NEXT:    cmeq.4s v0, v0, #0
+; SDAG-NEXT:    uzp1.8h v0, v0, v1
+; SDAG-NEXT:    mvn.16b v0, v0
+; SDAG-NEXT:    xtn.8b v0, v0
+; SDAG-NEXT:    umov.b w8, v0[0]
+; SDAG-NEXT:    umov.b w9, v0[1]
+; SDAG-NEXT:    umov.b w10, v0[2]
+; SDAG-NEXT:    and w8, w8, #0x1
+; SDAG-NEXT:    bfi w8, w9, #1, #1
+; SDAG-NEXT:    umov.b w9, v0[3]
+; SDAG-NEXT:    bfi w8, w10, #2, #1
+; SDAG-NEXT:    umov.b w10, v0[4]
+; SDAG-NEXT:    bfi w8, w9, #3, #1
+; SDAG-NEXT:    umov.b w9, v0[5]
+; SDAG-NEXT:    bfi w8, w10, #4, #1
+; SDAG-NEXT:    orr w8, w8, w9, lsl #5
+; SDAG-NEXT:    and w0, w8, #0x3f
+; SDAG-NEXT:    add sp, sp, #16
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: no_combine_illegal_num_elements:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    mov.s v0[0], w0
+; GISEL-NEXT:    mov.s v1[0], w4
+; GISEL-NEXT:    mov.s v2[0], wzr
+; GISEL-NEXT:    mov.s v0[1], w1
+; GISEL-NEXT:    mov.s v1[1], w5
+; GISEL-NEXT:    mov.s v2[1], wzr
+; GISEL-NEXT:    mov.s v0[2], w2
+; GISEL-NEXT:    cmeq.4s v1, v1, v2
+; GISEL-NEXT:    mvn.16b v1, v1
+; GISEL-NEXT:    mov.s v0[3], w3
+; GISEL-NEXT:    cmeq.4s v0, v0, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    mov.s w8, v0[1]
+; GISEL-NEXT:    mov.s w9, v0[2]
+; GISEL-NEXT:    mov.s w10, v0[3]
+; GISEL-NEXT:    mov.h v0[1], w8
+; GISEL-NEXT:    mov.s w8, v1[1]
+; GISEL-NEXT:    mov.h v0[2], w9
+; GISEL-NEXT:    mov.h v0[3], w10
+; GISEL-NEXT:    mov.h v0[4], v1[0]
+; GISEL-NEXT:    mov.h v0[5], w8
+; GISEL-NEXT:    umov.h w8, v0[1]
+; GISEL-NEXT:    umov.h w9, v0[0]
+; GISEL-NEXT:    umov.h w10, v0[2]
+; GISEL-NEXT:    umov.h w11, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    umov.h w10, v0[4]
+; GISEL-NEXT:    orr w8, w9, w8, lsl #2
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.h w11, v0[5]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #4
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #5
+; GISEL-NEXT:    and w8, w8, #0x3f
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
   %cmp_result = icmp ne <6 x i32> %vec, zeroinitializer
   %bitmask = bitcast <6 x i1> %cmp_result to i6
@@ -492,28 +921,220 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) {
 
 ; Only apply the combine when casting a vector to a scalar.
 define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind {
-; CHECK-LABEL: vector_to_vector_cast:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    shl.16b v0, v0, #7
-; CHECK-NEXT:  Lloh36:
-; CHECK-NEXT:    adrp x8, lCPI20_0@PAGE
-; CHECK-NEXT:  Lloh37:
-; CHECK-NEXT:    ldr q1, [x8, lCPI20_0@PAGEOFF]
-; CHECK-NEXT:    add x8, sp, #14
-; CHECK-NEXT:    cmlt.16b v0, v0, #0
-; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:    ext.16b v1, v0, v0, #8
-; CHECK-NEXT:    zip1.16b v0, v0, v1
-; CHECK-NEXT:    addv.8h h0, v0
-; CHECK-NEXT:    str h0, [sp, #14]
-; CHECK-NEXT:    ld1.b { v0 }[0], [x8]
-; CHECK-NEXT:    orr x8, x8, #0x1
-; CHECK-NEXT:    ld1.b { v0 }[4], [x8]
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh36, Lloh37
+; SDAG-LABEL: vector_to_vector_cast:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    sub sp, sp, #16
+; SDAG-NEXT:    shl.16b v0, v0, #7
+; SDAG-NEXT:    adrp x8, lCPI20_0@PAGE
+; SDAG-NEXT:    ldr q1, [x8, lCPI20_0@PAGEOFF]
+; SDAG-NEXT:    add x8, sp, #14
+; SDAG-NEXT:    cmlt.16b v0, v0, #0
+; SDAG-NEXT:    and.16b v0, v0, v1
+; SDAG-NEXT:    ext.16b v1, v0, v0, #8
+; SDAG-NEXT:    zip1.16b v0, v0, v1
+; SDAG-NEXT:    addv.8h h0, v0
+; SDAG-NEXT:    str h0, [sp, #14]
+; SDAG-NEXT:    ld1.b { v0 }[0], [x8]
+; SDAG-NEXT:    orr x8, x8, #0x1
+; SDAG-NEXT:    ld1.b { v0 }[4], [x8]
+; SDAG-NEXT:    ; kill: def $d0 killed $d0 killed $q0
+; SDAG-NEXT:    add sp, sp, #16
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: vector_to_vector_cast:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    umov.b w8, v0[1]
+; GISEL-NEXT:    mov d1, v0[1]
+; GISEL-NEXT:    umov.b w10, v0[1]
+; GISEL-NEXT:    umov.b w9, v0[0]
+; GISEL-NEXT:    umov.b w13, v0[0]
+; GISEL-NEXT:    umov.b w14, v0[2]
+; GISEL-NEXT:    umov.b w15, v0[3]
+; GISEL-NEXT:    umov.b w11, v0[2]
+; GISEL-NEXT:    umov.b w16, v0[4]
+; GISEL-NEXT:    umov.b w17, v0[5]
+; GISEL-NEXT:    umov.b w12, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    umov.b w0, v1[1]
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    bfi w13, w10, #1, #31
+; GISEL-NEXT:    and w14, w14, #0x1
+; GISEL-NEXT:    umov.b w8, v1[0]
+; GISEL-NEXT:    umov.b w10, v1[2]
+; GISEL-NEXT:    and w15, w15, #0x1
+; GISEL-NEXT:    orr w13, w13, w14, lsl #2
+; GISEL-NEXT:    umov.b w14, v1[3]
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    and w0, w0, #0x1
+; GISEL-NEXT:    and w16, w16, #0x1
+; GISEL-NEXT:    orr w9, w9, w11, lsl #2
+; GISEL-NEXT:    orr w13, w13, w15, lsl #3
+; GISEL-NEXT:    umov.b w15, v1[4]
+; GISEL-NEXT:    umov.b w11, v0[6]
+; GISEL-NEXT:    bfi w8, w0, #1, #31
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    and w17, w17, #0x1
+; GISEL-NEXT:    orr w13, w13, w16, lsl #4
+; GISEL-NEXT:    and w14, w14, #0x1
+; GISEL-NEXT:    umov.b w0, v0[7]
+; GISEL-NEXT:    orr w8, w8, w10, lsl #2
+; GISEL-NEXT:    umov.b w10, v1[5]
+; GISEL-NEXT:    umov.b w16, v1[6]
+; GISEL-NEXT:    orr w13, w13, w17, lsl #5
+; GISEL-NEXT:    umov.b w17, v0[4]
+; GISEL-NEXT:    and w15, w15, #0x1
+; GISEL-NEXT:    orr w8, w8, w14, lsl #3
+; GISEL-NEXT:    and w12, w12, #0x1
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    umov.b w14, v1[7]
+; GISEL-NEXT:    orr w9, w9, w12, lsl #3
+; GISEL-NEXT:    orr w11, w13, w11, lsl #6
+; GISEL-NEXT:    orr w8, w8, w15, lsl #4
+; GISEL-NEXT:    umov.b w15, v0[5]
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    and w0, w0, #0x1
+; GISEL-NEXT:    and w12, w17, #0x1
+; GISEL-NEXT:    umov.b w13, v0[1]
+; GISEL-NEXT:    orr w8, w8, w10, lsl #5
+; GISEL-NEXT:    and w16, w16, #0x1
+; GISEL-NEXT:    orr w9, w9, w12, lsl #4
+; GISEL-NEXT:    umov.b w10, v0[0]
+; GISEL-NEXT:    orr w11, w11, w0, lsl #7
+; GISEL-NEXT:    and w14, w14, #0x1
+; GISEL-NEXT:    and w12, w15, #0x1
+; GISEL-NEXT:    umov.b w15, v0[2]
+; GISEL-NEXT:    orr w8, w8, w16, lsl #6
+; GISEL-NEXT:    orr w9, w9, w12, lsl #5
+; GISEL-NEXT:    umov.b w12, v0[6]
+; GISEL-NEXT:    strb w11, [sp, #8]
+; GISEL-NEXT:    and w11, w13, #0x1
+; GISEL-NEXT:    umov.b w13, v0[3]
+; GISEL-NEXT:    orr w8, w8, w14, lsl #7
+; GISEL-NEXT:    umov.b w14, v0[7]
+; GISEL-NEXT:    ldr b0, [sp, #8]
+; GISEL-NEXT:    bfi w10, w11, #1, #31
+; GISEL-NEXT:    and w11, w15, #0x1
+; GISEL-NEXT:    strb w8, [sp, #9]
+; GISEL-NEXT:    umov.b w15, v0[4]
+; GISEL-NEXT:    and w8, w12, #0x1
+; GISEL-NEXT:    orr w10, w10, w11, lsl #2
+; GISEL-NEXT:    orr w8, w9, w8, lsl #6
+; GISEL-NEXT:    and w9, w13, #0x1
+; GISEL-NEXT:    umov.b w11, v0[1]
+; GISEL-NEXT:    orr w9, w10, w9, lsl #3
+; GISEL-NEXT:    umov.b w10, v0[5]
+; GISEL-NEXT:    umov.b w12, v0[0]
+; GISEL-NEXT:    and w13, w14, #0x1
+; GISEL-NEXT:    umov.b w16, v0[2]
+; GISEL-NEXT:    umov.b w17, v0[3]
+; GISEL-NEXT:    and w14, w15, #0x1
+; GISEL-NEXT:    umov.b w15, v0[2]
+; GISEL-NEXT:    orr w8, w8, w13, lsl #7
+; GISEL-NEXT:    orr w9, w9, w14, lsl #4
+; GISEL-NEXT:    umov.b w13, v0[6]
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    umov.b w14, v0[3]
+; GISEL-NEXT:    strb w8, [sp, #10]
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    bfi w12, w11, #1, #31
+; GISEL-NEXT:    orr w8, w9, w8, lsl #5
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    and w9, w15, #0x1
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    umov.b w15, v0[1]
+; GISEL-NEXT:    orr w9, w12, w9, lsl #2
+; GISEL-NEXT:    umov.b w12, v0[5]
+; GISEL-NEXT:    and w13, w13, #0x1
+; GISEL-NEXT:    and w14, w14, #0x1
+; GISEL-NEXT:    orr w8, w8, w13, lsl #6
+; GISEL-NEXT:    umov.b w13, v0[0]
+; GISEL-NEXT:    orr w9, w9, w14, lsl #3
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    umov.b w14, v0[6]
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    and w15, w15, #0x1
+; GISEL-NEXT:    umov.b w0, v0[3]
+; GISEL-NEXT:    orr w9, w9, w10, lsl #4
+; GISEL-NEXT:    and w10, w12, #0x1
+; GISEL-NEXT:    umov.b w12, v0[7]
+; GISEL-NEXT:    orr w8, w8, w11, lsl #7
+; GISEL-NEXT:    bfi w13, w15, #1, #31
+; GISEL-NEXT:    and w11, w16, #0x1
+; GISEL-NEXT:    orr w9, w9, w10, lsl #5
+; GISEL-NEXT:    and w10, w14, #0x1
+; GISEL-NEXT:    umov.b w14, v0[4]
+; GISEL-NEXT:    strb w8, [sp, #11]
+; GISEL-NEXT:    umov.b w15, v0[1]
+; GISEL-NEXT:    umov.b w16, v0[3]
+; GISEL-NEXT:    orr w8, w9, w10, lsl #6
+; GISEL-NEXT:    orr w9, w13, w11, lsl #2
+; GISEL-NEXT:    and w10, w12, #0x1
+; GISEL-NEXT:    and w11, w17, #0x1
+; GISEL-NEXT:    umov.b w12, v0[5]
+; GISEL-NEXT:    umov.b w17, v0[0]
+; GISEL-NEXT:    orr w8, w8, w10, lsl #7
+; GISEL-NEXT:    orr w9, w9, w11, lsl #3
+; GISEL-NEXT:    umov.b w10, v0[1]
+; GISEL-NEXT:    and w11, w14, #0x1
+; GISEL-NEXT:    umov.b w14, v0[0]
+; GISEL-NEXT:    and w15, w15, #0x1
+; GISEL-NEXT:    orr w9, w9, w11, lsl #4
+; GISEL-NEXT:    umov.b w11, v0[2]
+; GISEL-NEXT:    umov.b w13, v0[6]
+; GISEL-NEXT:    and w12, w12, #0x1
+; GISEL-NEXT:    bfi w17, w15, #1, #31
+; GISEL-NEXT:    umov.b w15, v0[5]
+; GISEL-NEXT:    orr w9, w9, w12, lsl #5
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    umov.b w12, v0[2]
+; GISEL-NEXT:    bfi w14, w10, #1, #31
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    ldr b1, [sp, #9]
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    and w13, w13, #0x1
+; GISEL-NEXT:    strb w8, [sp, #12]
+; GISEL-NEXT:    orr w11, w14, w11, lsl #2
+; GISEL-NEXT:    and w14, w16, #0x1
+; GISEL-NEXT:    umov.b w16, v0[4]
+; GISEL-NEXT:    and w12, w12, #0x1
+; GISEL-NEXT:    and w15, w15, #0x1
+; GISEL-NEXT:    orr w9, w9, w13, lsl #6
+; GISEL-NEXT:    orr w11, w11, w14, lsl #3
+; GISEL-NEXT:    orr w12, w17, w12, lsl #2
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    and w17, w0, #0x1
+; GISEL-NEXT:    umov.b w0, v0[5]
+; GISEL-NEXT:    umov.b w14, v0[6]
+; GISEL-NEXT:    orr w10, w11, w10, lsl #4
+; GISEL-NEXT:    orr w12, w12, w17, lsl #3
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    and w16, w16, #0x1
+; GISEL-NEXT:    umov.b w17, v0[6]
+; GISEL-NEXT:    orr w10, w10, w15, lsl #5
+; GISEL-NEXT:    umov.b w15, v0[7]
+; GISEL-NEXT:    orr w12, w12, w16, lsl #4
+; GISEL-NEXT:    and w16, w0, #0x1
+; GISEL-NEXT:    umov.b w0, v0[7]
+; GISEL-NEXT:    and w14, w14, #0x1
+; GISEL-NEXT:    orr w12, w12, w16, lsl #5
+; GISEL-NEXT:    orr w10, w10, w14, lsl #6
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    and w13, w17, #0x1
+; GISEL-NEXT:    orr w9, w9, w11, lsl #7
+; GISEL-NEXT:    mov.s v0[1], v1[0]
+; GISEL-NEXT:    orr w11, w12, w13, lsl #6
+; GISEL-NEXT:    and w12, w15, #0x1
+; GISEL-NEXT:    ; kill: def $d0 killed $d0 killed $q0
+; GISEL-NEXT:    orr w8, w10, w12, lsl #7
+; GISEL-NEXT:    and w10, w0, #0x1
+; GISEL-NEXT:    strb w9, [sp, #13]
+; GISEL-NEXT:    orr w9, w11, w10, lsl #7
+; GISEL-NEXT:    strb w8, [sp, #14]
+; GISEL-NEXT:    strb w9, [sp, #15]
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
   %bc = bitcast <16 x i1> %arg to <2 x i8>
   ret <2 x i8> %bc
 }
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 8473f45..5d6b523 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -72,6 +72,24 @@ entry:
   ret i64 %z
 }
 
+define i64 @add_v4i32_v4i64_zsext(<4 x i32> %xi) {
+; CHECK-LABEL: add_v4i32_v4i64_zsext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-NEXT:    addp d0, v0.2d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+entry:
+  %x = shufflevector <4 x i32> %xi, <4 x i32> %xi, <2 x i32> <i32 0, i32 1>
+  %y = shufflevector <4 x i32> %xi, <4 x i32> %xi, <2 x i32> <i32 2, i32 3>
+  %xx = zext <2 x i32> %x to <2 x i64>
+  %yy = sext <2 x i32> %y to <2 x i64>
+  %zz = add <2 x i64> %xx, %yy
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %zz)
+  ret i64 %z
+}
+
 define i64 @add_v2i32_v2i64_zext(<2 x i32> %x) {
 ; CHECK-LABEL: add_v2i32_v2i64_zext:
 ; CHECK:       // %bb.0: // %entry
diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-abs-neg.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-abs-neg.ll
index 1caee99..510d457 100644
--- a/llvm/test/CodeGen/AArch64/zeroing-forms-abs-neg.ll
+++ b/llvm/test/CodeGen/AArch64/zeroing-forms-abs-neg.ll
@@ -18,7 +18,7 @@ define <vscale x 2 x double> @test_svabs_f64_x_1(<vscale x 2 x i1> %pg, <vscale
 ; CHECK-2p2-NEXT:    fabs z0.d, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fabs.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fabs.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 2 x double> %0
 }
 
@@ -34,7 +34,7 @@ define <vscale x 2 x double> @test_svabs_f64_x_2(<vscale x 2 x i1> %pg, double %
 ; CHECK-2p2-NEXT:    fabs z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fabs.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fabs.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 2 x double> %0
 }
 
@@ -65,7 +65,7 @@ define <vscale x 4 x float> @test_svabs_f32_x_1(<vscale x 4 x i1> %pg, <vscale x
 ; CHECK-2p2-NEXT:    fabs z0.s, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fabs.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fabs.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -81,7 +81,7 @@ define <vscale x 4 x float> @test_svabs_f32_x_2(<vscale x 4 x i1> %pg, double %z
 ; CHECK-2p2-NEXT:    fabs z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fabs.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fabs.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -112,7 +112,7 @@ define <vscale x 8 x half> @test_svabs_f16_x_1(<vscale x 8 x i1> %pg, <vscale x
 ; CHECK-2p2-NEXT:    fabs z0.h, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fabs.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fabs.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 8 x half> %0
 }
 
@@ -128,7 +128,7 @@ define <vscale x 8 x half> @test_svabs_f16_x_2(<vscale x 8 x i1> %pg, double %z0
 ; CHECK-2p2-NEXT:    fabs z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fabs.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fabs.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 8 x half> %0
 }
 
@@ -159,7 +159,7 @@ define <vscale x 16 x i8> @test_svabs_s8_x_1(<vscale x 16 x i1> %pg, <vscale x 1
 ; CHECK-2p2-NEXT:    abs z0.b, p0/z, z0.b
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.abs.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.abs.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
   ret <vscale x 16 x i8> %0
 }
 
@@ -175,8 +175,8 @@ define <vscale x 16 x i8> @test_svabs_s8_x_2(<vscale x 16 x i1> %pg, double %z0,
 ; CHECK-2p2-NEXT:    abs z0.b, p0/z, z1.b
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.abs.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
-  ret <vscale x 16 x i8> %1
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.abs.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
+  ret <vscale x 16 x i8> %0
 }
 
 define <vscale x 16 x i8> @test_svabs_s8_z(<vscale x 16 x i1> %pg, double %z0, <vscale x 16 x i8> %x) {
@@ -191,8 +191,8 @@ define <vscale x 16 x i8> @test_svabs_s8_z(<vscale x 16 x i1> %pg, double %z0, <
 ; CHECK-2p2-NEXT:    abs z0.b, p0/z, z1.b
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.abs.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
-  ret <vscale x 16 x i8> %1
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.abs.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
+  ret <vscale x 16 x i8> %0
 }
 
 define <vscale x 8 x i16> @test_svabs_s16_x_1(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %x) {
@@ -206,7 +206,7 @@ define <vscale x 8 x i16> @test_svabs_s16_x_1(<vscale x 8 x i1> %pg, <vscale x 8
 ; CHECK-2p2-NEXT:    abs z0.h, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.abs.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.abs.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
   ret <vscale x 8 x i16> %0
 }
 
@@ -222,7 +222,7 @@ define <vscale x 8 x i16> @test_svabs_s16_x_2(<vscale x 8 x i1> %pg, double %z0,
 ; CHECK-2p2-NEXT:    abs z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.abs.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.abs.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
   ret <vscale x 8 x i16> %0
 }
 
@@ -253,7 +253,7 @@ define <vscale x 4 x i32> @test_svabs_s32_x_1(<vscale x 4 x i1> %pg, <vscale x 4
 ; CHECK-2p2-NEXT:    abs z0.s, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.abs.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.abs.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -269,7 +269,7 @@ define <vscale x 4 x i32> @test_svabs_s32_x_2(<vscale x 4 x i1> %pg, double %z0,
 ; CHECK-2p2-NEXT:    abs z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.abs.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.abs.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -300,7 +300,7 @@ define <vscale x 2 x i64> @test_svabs_s64_x_1(<vscale x 2 x i1> %pg, <vscale x 2
 ; CHECK-2p2-NEXT:    abs z0.d, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.abs.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.abs.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -316,7 +316,7 @@ define <vscale x 2 x i64> @test_svabs_s64_x_2(<vscale x 2 x i1> %pg, double %z0,
 ; CHECK-2p2-NEXT:    abs z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.abs.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.abs.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -347,7 +347,7 @@ define <vscale x 2 x double> @test_svneg_f64_x_1(<vscale x 2 x i1> %pg, <vscale
 ; CHECK-2p2-NEXT:    fneg z0.d, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fneg.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fneg.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 2 x double> %0
 }
 
@@ -363,7 +363,7 @@ define <vscale x 2 x double> @test_svneg_f64_x_2(<vscale x 2 x i1> %pg, double %
 ; CHECK-2p2-NEXT:    fneg z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fneg.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fneg.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 2 x double> %0
 }
 
@@ -394,7 +394,7 @@ define <vscale x 4 x float> @test_svneg_f32_x_1(<vscale x 4 x i1> %pg, <vscale x
 ; CHECK-2p2-NEXT:    fneg z0.s, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fneg.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fneg.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -410,7 +410,7 @@ define <vscale x 4 x float> @test_svneg_f32_x_2(<vscale x 4 x i1> %pg, double %z
 ; CHECK-2p2-NEXT:    fneg z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fneg.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fneg.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -441,7 +441,7 @@ define <vscale x 8 x half> @test_svneg_f16_x_1(<vscale x 8 x i1> %pg, <vscale x
 ; CHECK-2p2-NEXT:    fneg z0.h, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fneg.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fneg.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 8 x half> %0
 }
 
@@ -457,7 +457,7 @@ define <vscale x 8 x half> @test_svneg_f16_x_2(<vscale x 8 x i1> %pg, double %z0
 ; CHECK-2p2-NEXT:    fneg z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fneg.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fneg.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 8 x half> %0
 }
 
@@ -488,7 +488,7 @@ define <vscale x 16 x i8> @test_svneg_s8_x_1(<vscale x 16 x i1> %pg, <vscale x 1
 ; CHECK-2p2-NEXT:    neg z0.b, p0/z, z0.b
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.neg.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.neg.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
   ret <vscale x 16 x i8> %0
 }
 
@@ -504,8 +504,8 @@ define <vscale x 16 x i8> @test_svneg_s8_x_2(<vscale x 16 x i1> %pg, double %z0,
 ; CHECK-2p2-NEXT:    neg z0.b, p0/z, z1.b
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.neg.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
-  ret <vscale x 16 x i8> %1
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.neg.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
+  ret <vscale x 16 x i8> %0
 }
 
 define <vscale x 16 x i8> @test_svneg_s8_z(<vscale x 16 x i1> %pg, double %z0, <vscale x 16 x i8> %x) {
@@ -520,8 +520,8 @@ define <vscale x 16 x i8> @test_svneg_s8_z(<vscale x 16 x i1> %pg, double %z0, <
 ; CHECK-2p2-NEXT:    neg z0.b, p0/z, z1.b
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.neg.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
-  ret <vscale x 16 x i8> %1
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.neg.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
+  ret <vscale x 16 x i8> %0
 }
 
 define <vscale x 8 x i16> @test_svneg_s16_x_1(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %x) {
@@ -535,7 +535,7 @@ define <vscale x 8 x i16> @test_svneg_s16_x_1(<vscale x 8 x i1> %pg, <vscale x 8
 ; CHECK-2p2-NEXT:    neg z0.h, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
   ret <vscale x 8 x i16> %0
 }
 
@@ -551,7 +551,7 @@ define <vscale x 8 x i16> @test_svneg_s16_x_2(<vscale x 8 x i1> %pg, double %z0,
 ; CHECK-2p2-NEXT:    neg z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
   ret <vscale x 8 x i16> %0
 }
 
@@ -582,7 +582,7 @@ define <vscale x 4 x i32> @test_svneg_s32_x_1(<vscale x 4 x i1> %pg, <vscale x 4
 ; CHECK-2p2-NEXT:    neg z0.s, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -598,7 +598,7 @@ define <vscale x 4 x i32> @test_svneg_s32_x_2(<vscale x 4 x i1> %pg, double %z0,
 ; CHECK-2p2-NEXT:    neg z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -629,7 +629,7 @@ define <vscale x 2 x i64> @test_svneg_s64_x_1(<vscale x 2 x i1> %pg, <vscale x 2
 ; CHECK-2p2-NEXT:    neg z0.d, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -645,7 +645,7 @@ define <vscale x 2 x i64> @test_svneg_s64_x_2(<vscale x 2 x i1> %pg, double %z0,
 ; CHECK-2p2-NEXT:    neg z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -664,3 +664,535 @@ entry:
   %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
   ret <vscale x 2 x i64> %0
 }
+
+define <vscale x 2 x double> @test_svfabs_f64_ptrue_u(double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svfabs_f64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fabs z0.d, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfabs_f64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fabs z0.d, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fabs.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x double> @test_svfabs_f64_ptrue(double %z0, <vscale x 2 x double> %x, <vscale x 2 x double> %y) {
+; CHECK-LABEL: test_svfabs_f64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fabs z0.d, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfabs_f64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fabs z0.d, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fabs.nxv2f64(<vscale x 2 x double> %x, <vscale x 2 x i1> %pg, <vscale x 2 x double> %y)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 4 x float> @test_svfabs_f32_ptrue_u(double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svfabs_f32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fabs z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfabs_f32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fabs z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fabs.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svfabs_f32_ptrue(double %z0, <vscale x 4 x float> %x, <vscale x 4 x float> %y) {
+; CHECK-LABEL: test_svfabs_f32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fabs z0.s, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfabs_f32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fabs z0.s, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fabs.nxv4f32(<vscale x 4 x float> %x, <vscale x 4 x i1> %pg, <vscale x 4 x float> %y)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 8 x half> @test_svfabs_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_svfabs_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fabs z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfabs_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    fabs z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fabs.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 8 x half> @test_svfabs_f16_ptrue(double %z0, <vscale x 8 x half> %x, <vscale x 8 x half> %y) {
+; CHECK-LABEL: test_svfabs_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fabs z0.h, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfabs_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    fabs z0.h, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fabs.nxv8f16(<vscale x 8 x half> %x, <vscale x 8 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 16 x i8> @test_svabs_s8_ptrue_u(double %z0, <vscale x 16 x i8> %x) {
+; CHECK-LABEL: test_svabs_s8_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    abs z0.b, p0/m, z1.b
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svabs_s8_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.b
+; CHECK-2p2-NEXT:    abs z0.b, p0/z, z1.b
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.abs.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
+  ret <vscale x 16 x i8> %0
+}
+
+define <vscale x 16 x i8> @test_svabs_s8_ptrue(double %z0, <vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
+; CHECK-LABEL: test_svabs_s8_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    abs z0.b, p0/m, z2.b
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svabs_s8_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.b
+; CHECK-2p2-NEXT:    abs z0.b, p0/z, z2.b
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.abs.nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %y)
+  ret <vscale x 16 x i8> %0
+}
+
+define <vscale x 8 x i16> @test_svabs_s16_ptrue_u(double %z0, <vscale x 8 x i16> %x) {
+; CHECK-LABEL: test_svabs_s16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    abs z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svabs_s16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    abs z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nx84i1(i32 31)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.abs.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
+  ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 8 x i16> @test_svabs_s16_ptrue(double %z0, <vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
+; CHECK-LABEL: test_svabs_s16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    abs z0.h, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svabs_s16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    abs z0.h, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.abs.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %y)
+  ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 4 x i32> @test_svabs_s32_ptrue_u(double %z0, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: test_svabs_s32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    abs z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svabs_s32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    abs z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.abs.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_svabs_s32_ptrue(double %z0, <vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
+; CHECK-LABEL: test_svabs_s32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    abs z0.s, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svabs_s32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    abs z0.s, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.abs.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %y)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_svabs_s64_ptrue_u(double %z0, <vscale x 2 x i64> %x) {
+; CHECK-LABEL: test_svabs_s64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    abs z0.d, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svabs_s64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    abs z0.d, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.abs.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_svabs_s64_ptrue(double %z0, <vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
+; CHECK-LABEL: test_svabs_s64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    abs z0.d, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svabs_s64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    abs z0.d, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.abs.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %y)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x double> @test_svfneg_f64_ptrue_u(double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svfneg_f64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fneg z0.d, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfneg_f64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fneg z0.d, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fneg.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x double> @test_svfneg_f64_ptrue(double %z0, <vscale x 2 x double> %x, <vscale x 2 x double> %y) {
+; CHECK-LABEL: test_svfneg_f64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fneg z0.d, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfneg_f64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fneg z0.d, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fneg.nxv2f64(<vscale x 2 x double> %x, <vscale x 2 x i1> %pg, <vscale x 2 x double> %y)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 4 x float> @test_svfneg_f32_ptrue_u(double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svfneg_f32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fneg z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfneg_f32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fneg z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fneg.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svfneg_f32_ptrue(double %z0, <vscale x 4 x float> %x, <vscale x 4 x float> %y) {
+; CHECK-LABEL: test_svfneg_f32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fneg z0.s, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfneg_f32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fneg z0.s, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fneg.nxv4f32(<vscale x 4 x float> %x, <vscale x 4 x i1> %pg, <vscale x 4 x float> %y)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 8 x half> @test_svfneg_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_svfneg_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fneg z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfneg_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    fneg z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fneg.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 8 x half> @test_svfneg_f16_ptrue(double %z0, <vscale x 8 x half> %x, <vscale x 8 x half> %y) {
+; CHECK-LABEL: test_svfneg_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fneg z0.h, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfneg_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    fneg z0.h, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fneg.nxv8f16(<vscale x 8 x half> %x, <vscale x 8 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 16 x i8> @test_svneg_s8_ptrue_u(double %z0, <vscale x 16 x i8> %x) {
+; CHECK-LABEL: test_svneg_s8_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    neg z0.b, p0/m, z1.b
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svneg_s8_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.b
+; CHECK-2p2-NEXT:    neg z0.b, p0/z, z1.b
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.neg.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
+  ret <vscale x 16 x i8> %0
+}
+
+define <vscale x 16 x i8> @test_svneg_s8_ptrue(double %z0, <vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
+; CHECK-LABEL: test_svneg_s8_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    neg z0.b, p0/m, z2.b
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svneg_s8_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.b
+; CHECK-2p2-NEXT:    neg z0.b, p0/z, z2.b
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.neg.nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %y)
+  ret <vscale x 16 x i8> %0
+}
+
+define <vscale x 8 x i16> @test_svneg_s16_ptrue_u(double %z0, <vscale x 8 x i16> %x) {
+; CHECK-LABEL: test_svneg_s16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    neg z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svneg_s16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    neg z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
+  ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 8 x i16> @test_svneg_s16_ptrue(double %z0, <vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
+; CHECK-LABEL: test_svneg_s16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    neg z0.h, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svneg_s16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    neg z0.h, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %y)
+  ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 4 x i32> @test_svneg_s32_ptrue_u(double %z0, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: test_svneg_s32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    neg z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svneg_s32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    neg z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_svneg_s32_ptrue(double %z0, <vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
+; CHECK-LABEL: test_svneg_s32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    neg z0.s, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svneg_s32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    neg z0.s, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %y)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_svneg_s64_ptrue_u(double %z0, <vscale x 2 x i64> %x) {
+; CHECK-LABEL: test_svneg_s64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    neg z0.d, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svneg_s64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    neg z0.d, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_svneg_s64_ptrue(double %z0, <vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
+; CHECK-LABEL: test_svneg_s64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    neg z0.d, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svneg_s64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    neg z0.d, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %y)
+  ret <vscale x 2 x i64> %0
+}
diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvt-bfcvt.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvt-bfcvt.ll
index cf9ac49..855bf9a 100644
--- a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvt-bfcvt.ll
+++ b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvt-bfcvt.ll
@@ -18,7 +18,7 @@ define <vscale x 8 x half> @test_svcvt_f16_f32_x_1(<vscale x 4 x i1> %pg, <vscal
 ; CHECK-2p2-NEXT:    fcvt z0.h, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f32(<vscale x 8 x half> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f32(<vscale x 8 x half> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 8 x half> %0
 }
 
@@ -33,7 +33,7 @@ define <vscale x 8 x half> @test_svcvt_f16_f32_x_2(<vscale x 4 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvt z0.h, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f32(<vscale x 8 x half> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f32(<vscale x 8 x half> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 8 x half> %0
 }
 
@@ -64,7 +64,7 @@ define <vscale x 8 x bfloat> @test_svcvt_bf16_f32_x_1(<vscale x 4 x i1> %pg, <vs
 ; CHECK-2p2-NEXT:    bfcvt z0.h, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32.v2(<vscale x 8 x bfloat> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32.v2(<vscale x 8 x bfloat> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 8 x bfloat> %0
 }
 
@@ -79,7 +79,7 @@ define <vscale x 8 x bfloat> @test_svcvt_bf16_f32_x_2(<vscale x 4 x i1> %pg, dou
 ; CHECK-2p2-NEXT:    bfcvt z0.h, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32.v2(<vscale x 8 x bfloat> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32.v2(<vscale x 8 x bfloat> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 8 x bfloat> %0
 }
 
@@ -110,7 +110,7 @@ define <vscale x 8 x half> @test_svcvt_f16_f64_x_1(<vscale x 2 x i1> %pg, <vscal
 ; CHECK-2p2-NEXT:    fcvt z0.h, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f64(<vscale x 8 x half> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f64(<vscale x 8 x half> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 8 x half> %0
 }
 
@@ -125,7 +125,7 @@ define <vscale x 8 x half> @test_svcvt_f16_f64_x_2(<vscale x 2 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvt z0.h, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f64(<vscale x 8 x half> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f64(<vscale x 8 x half> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 8 x half> %0
 }
 
@@ -156,7 +156,7 @@ define <vscale x 4 x float> @test_svcvt_f32_f64_x_1(<vscale x 2 x i1> %pg, <vsca
 ; CHECK-2p2-NEXT:    fcvt z0.s, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f64(<vscale x 4 x float> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f64(<vscale x 4 x float> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -171,7 +171,7 @@ define <vscale x 4 x float> @test_svcvt_f32_f64_x_2(<vscale x 2 x i1> %pg, doubl
 ; CHECK-2p2-NEXT:    fcvt z0.s, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f64(<vscale x 4 x float> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f64(<vscale x 4 x float> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -202,7 +202,7 @@ define <vscale x 4 x float> @test_svcvt_f32_f16_x_1(<vscale x 4 x i1> %pg, <vsca
 ; CHECK-2p2-NEXT:    fcvt z0.s, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f16(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f16(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -217,7 +217,7 @@ define <vscale x 4 x float> @test_svcvt_f32_f16_x_2(<vscale x 4 x i1> %pg, doubl
 ; CHECK-2p2-NEXT:    fcvt z0.s, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f16(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f16(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -248,7 +248,7 @@ define <vscale x 2 x double> @test_svcvt_f64_f16_x_1(<vscale x 2 x i1> %pg, <vsc
 ; CHECK-2p2-NEXT:    fcvt z0.d, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f16(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f16(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 2 x double> %0
 }
 
@@ -263,7 +263,7 @@ define <vscale x 2 x double> @test_svcvt_f64_f16_x_2(<vscale x 2 x i1> %pg, doub
 ; CHECK-2p2-NEXT:    fcvt z0.d, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f16(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f16(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 2 x double> %0
 }
 
@@ -294,7 +294,7 @@ define <vscale x 2 x double> @test_svcvt_f64_f32_x_1(<vscale x 2 x i1> %pg, <vsc
 ; CHECK-2p2-NEXT:    fcvt z0.d, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f32(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f32(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 2 x double> %0
 }
 
@@ -309,7 +309,7 @@ define <vscale x 2 x double> @test_svcvt_f64_f32_x_2(<vscale x 2 x i1> %pg, doub
 ; CHECK-2p2-NEXT:    fcvt z0.d, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f32(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f32(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 2 x double> %0
 }
 
@@ -328,3 +328,262 @@ entry:
   %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f32(<vscale x 2 x double> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 2 x double> %0
 }
+
+define <vscale x 8 x half> @test_svcvt_f16_f32_ptrue_u(double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svcvt_f16_f32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvt z0.h, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f16_f32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvt z0.h, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f32(<vscale x 8 x half> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 8 x half> @test_svcvt_f16_f32_ptrue(double %z0, <vscale x 8 x half> %x, <vscale x 4 x float> %y ) {
+; CHECK-LABEL: test_svcvt_f16_f32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvt z0.h, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f16_f32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvt z0.h, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f32(<vscale x 8 x half> %x, <vscale x 4 x i1> %pg, <vscale x 4 x float> %y)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 8 x bfloat> @test_svcvt_bf16_f32_ptrue_u(double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svcvt_bf16_f32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_bf16_f32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    bfcvt z0.h, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32.v2(<vscale x 8 x bfloat> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 8 x bfloat> %0
+}
+
+define <vscale x 8 x bfloat> @test_svcvt_bf16_f32_ptrue(double %z0, <vscale x 8 x bfloat> %x, <vscale x 4 x float> %y ) {
+; CHECK-LABEL: test_svcvt_bf16_f32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_bf16_f32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    bfcvt z0.h, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32.v2(<vscale x 8 x bfloat> %x, <vscale x 4 x i1> %pg, <vscale x 4 x float> %y)
+  ret <vscale x 8 x bfloat> %0
+}
+
+define <vscale x 8 x half> @test_svcvt_f16_f64_ptrue_u(double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svcvt_f16_f64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvt z0.h, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f16_f64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvt z0.h, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f64(<vscale x 8 x half> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 8 x half> @test_svcvt_f16_f64_ptrue(double %z0, <vscale x 8 x half> %x, <vscale x 2 x double> %y ) {
+; CHECK-LABEL: test_svcvt_f16_f64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvt z0.h, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f16_f64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvt z0.h, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f64(<vscale x 8 x half> %x, <vscale x 2 x i1> %pg, <vscale x 2 x double> %y)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 4 x float> @test_svcvt_f32_f64_ptrue_u(double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svcvt_f32_f64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvt z0.s, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f32_f64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvt z0.s, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f64(<vscale x 4 x float> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svcvt_f32_f64_ptrue(double %z0, <vscale x 4 x float> %x, <vscale x 2 x double> %y ) {
+; CHECK-LABEL: test_svcvt_f32_f64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvt z0.s, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f32_f64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvt z0.s, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f64(<vscale x 4 x float> %x, <vscale x 2 x i1> %pg, <vscale x 2 x double> %y)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svcvt_f32_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_svcvt_f32_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvt z0.s, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f32_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvt z0.s, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f16(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svcvt_f32_f16_ptrue(double %z0, <vscale x 4 x float> %x, <vscale x 8 x half> %y ) {
+; CHECK-LABEL: test_svcvt_f32_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvt z0.s, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f32_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvt z0.s, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f16(<vscale x 4 x float> %x, <vscale x 4 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 2 x double> @test_svcvt_f64_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_svcvt_f64_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvt z0.d, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f64_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvt z0.d, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f16(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x double> @test_svcvt_f64_f16_ptrue(double %z0, <vscale x 2 x double> %x, <vscale x 8 x half> %y ) {
+; CHECK-LABEL: test_svcvt_f64_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvt z0.d, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f64_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvt z0.d, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f16(<vscale x 2 x double> %x, <vscale x 2 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x double> @test_svcvt_f64_f32_ptrue_u(double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svcvt_f64_f32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvt z0.d, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f64_f32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvt z0.d, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f32(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x double> @test_svcvt_f64_f32_ptrue(double %z0, <vscale x 2 x double> %x, <vscale x 4 x float> %y ) {
+; CHECK-LABEL: test_svcvt_f64_f32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvt z0.d, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f64_f32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvt z0.d, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f32(<vscale x 2 x double> %x, <vscale x 2 x i1> %pg, <vscale x 4 x float> %y)
+  ret <vscale x 2 x double> %0
+}
diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtlt-fcvtx.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtlt-fcvtx.ll
index 60879b1..c7431e1 100644
--- a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtlt-fcvtx.ll
+++ b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtlt-fcvtx.ll
@@ -18,7 +18,7 @@ define <vscale x 4 x float> @test_svcvtlt_f32_f16_x_1(<vscale x 4 x i1> %pg, <vs
 ; CHECK-2p2-NEXT:    fcvtlt z0.s, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtlt.f32f16(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtlt.f32f16(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -33,7 +33,7 @@ define <vscale x 4 x float> @test_svcvtlt_f32_f16_x_2(<vscale x 4 x i1> %pg, dou
 ; CHECK-2p2-NEXT:    fcvtlt z0.s, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtlt.f32f16(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtlt.f32f16(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -64,7 +64,7 @@ define <vscale x 2 x double> @test_svcvtlt_f64_f32_x_1(<vscale x 2 x i1> %pg, <v
 ; CHECK-2p2-NEXT:    fcvtlt z0.d, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvtlt.f64f32(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvtlt.f64f32(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 2 x double> %0
 }
 
@@ -79,7 +79,7 @@ define <vscale x 2 x double> @test_svcvtlt_f64_f32_x_2(<vscale x 2 x i1> %pg, do
 ; CHECK-2p2-NEXT:    fcvtlt z0.d, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvtlt.f64f32(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvtlt.f64f32(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 2 x double> %0
 }
 
@@ -110,7 +110,7 @@ define <vscale x 4 x float> @test_svcvtx_f32_f64_x_1(<vscale x 2 x i1> %pg, <vsc
 ; CHECK-2p2-NEXT:    fcvtx z0.s, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtx.f32f64(<vscale x 4 x float> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtx.f32f64(<vscale x 4 x float> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -125,7 +125,7 @@ define <vscale x 4 x float> @test_svcvtx_f32_f64_x_2(<vscale x 2 x i1> %pg, doub
 ; CHECK-2p2-NEXT:    fcvtx z0.s, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtx.f32f64(<vscale x 4 x float> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtx.f32f64(<vscale x 4 x float> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -144,3 +144,114 @@ entry:
   %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtx.f32f64(<vscale x 4 x float> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 4 x float> %0
 }
+
+define <vscale x 4 x float> @test_svcvtlt_f32_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_svcvtlt_f32_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtlt z0.s, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvtlt_f32_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvtlt z0.s, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtlt.f32f16(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svcvtlt_f32_f16_ptrue(double %z0, <vscale x 4 x float> %x, <vscale x 8 x half> %y) {
+; CHECK-LABEL: test_svcvtlt_f32_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtlt z0.s, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvtlt_f32_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvtlt z0.s, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtlt.f32f16(<vscale x 4 x float> %x, <vscale x 4 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 2 x double> @test_svcvtlt_f64_f32_ptrue_u(double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svcvtlt_f64_f32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtlt z0.d, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvtlt_f64_f32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtlt z0.d, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvtlt.f64f32(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x double> @test_svcvtlt_f64_f32_ptrue(double %z0, <vscale x 2 x double> %x, <vscale x 4 x float> %y) {
+; CHECK-LABEL: test_svcvtlt_f64_f32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtlt z0.d, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvtlt_f64_f32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtlt z0.d, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvtlt.f64f32(<vscale x 2 x double> %x, <vscale x 2 x i1> %pg, <vscale x 4 x float> %y)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 4 x float> @test_svcvtx_f32_f64_ptrue_u(double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svcvtx_f32_f64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtx z0.s, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvtx_f32_f64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtx z0.s, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtx.f32f64(<vscale x 4 x float> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svcvtx_f32_f64_ptrue(double %z0, <vscale x 4 x float> %x, <vscale x 2 x double> %y) {
+; CHECK-LABEL: test_svcvtx_f32_f64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtx z0.s, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvtx_f32_f64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtx z0.s, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtx.f32f64(<vscale x 4 x float> %x, <vscale x 2 x i1> %pg, <vscale x 2 x double> %y)
+  ret <vscale x 4 x float> %0
+}
diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtzsu.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtzsu.ll
index b8b36d3..7259502 100644
--- a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtzsu.ll
+++ b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtzsu.ll
@@ -18,7 +18,7 @@ define <vscale x 4 x i32> @test_fcvtzs_s32_f64_x_1(<vscale x 2 x i1> %pg, <vscal
 ; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f64(<vscale x 4 x i32> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f64(<vscale x 4 x i32> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -33,7 +33,7 @@ define <vscale x 4 x i32> @test_fcvtzs_s32_f64_x_2(<vscale x 2 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f64(<vscale x 4 x i32> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f64(<vscale x 4 x i32> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -64,7 +64,7 @@ define <vscale x 2 x i64> @test_fcvtzs_s64_f32_x_1(<vscale x 2 x i1> %pg, <vscal
 ; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f32(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f32(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -79,7 +79,7 @@ define <vscale x 2 x i64> @test_fcvtzs_s64_f32_x_2(<vscale x 2 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f32(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f32(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -110,7 +110,7 @@ define <vscale x 4 x i32> @test_fcvtzs_s32_f16_x_1(<vscale x 4 x i1> %pg, <vscal
 ; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f16(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f16(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -125,7 +125,7 @@ define <vscale x 4 x i32> @test_fcvtzs_s32_f16_x_2(<vscale x 4 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f16(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f16(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -156,7 +156,7 @@ define <vscale x 2 x i64> @test_fcvtzs_s64_f16_x_1(<vscale x 2 x i1> %pg, <vscal
 ; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f16(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f16(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -171,7 +171,7 @@ define <vscale x 2 x i64> @test_fcvtzs_s64_f16_x_2(<vscale x 2 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f16(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f16(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -202,7 +202,7 @@ define <vscale x 4 x i32> @test_fcvtzu_u32_f64_x_1(<vscale x 2 x i1> %pg, <vscal
 ; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f64(<vscale x 4 x i32> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f64(<vscale x 4 x i32> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -217,7 +217,7 @@ define <vscale x 4 x i32> @test_fcvtzu_u32_f64_x_2(<vscale x 2 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f64(<vscale x 4 x i32> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f64(<vscale x 4 x i32> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -248,7 +248,7 @@ define <vscale x 2 x i64> @test_fcvtzu_u64_f32_x_1(<vscale x 2 x i1> %pg, <vscal
 ; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f32(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f32(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -263,7 +263,7 @@ define <vscale x 2 x i64> @test_fcvtzu_u64_f32_x_2(<vscale x 2 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f32(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f32(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -294,7 +294,7 @@ define <vscale x 4 x i32> @test_fcvtzu_u32_f16_x_1(<vscale x 4 x i1> %pg, <vscal
 ; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f16(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f16(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -309,7 +309,7 @@ define <vscale x 4 x i32> @test_fcvtzu_u32_f16_x_2(<vscale x 4 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f16(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f16(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -340,7 +340,7 @@ define <vscale x 2 x i64> @test_fcvtzu_u64_f16_x_1(<vscale x 2 x i1> %pg, <vscal
 ; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f16(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f16(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -355,7 +355,7 @@ define <vscale x 2 x i64> @test_fcvtzu_u64_f16_x_2(<vscale x 2 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f16(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f16(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -387,7 +387,7 @@ define <vscale x 8 x i16> @test_svcvt_s16_f16_x_1(<vscale x 8 x i1> %pg, <vscale
 ; CHECK-2p2-NEXT:    fcvtzs z0.h, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 8 x i16> %0
 }
 
@@ -403,7 +403,7 @@ define <vscale x 8 x i16> @test_svcvt_s16_f16_x_2(<vscale x 8 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzs z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 8 x i16> %0
 }
 
@@ -434,7 +434,7 @@ define <vscale x 8 x i16> @test_svcvt_u16_f16_x_1(<vscale x 8 x i1> %pg, <vscale
 ; CHECK-2p2-NEXT:    fcvtzu z0.h, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 8 x i16> %0
 }
 
@@ -450,7 +450,7 @@ define <vscale x 8 x i16> @test_svcvt_u16_f16_x_2(<vscale x 8 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzu z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 8 x i16> %0
 }
 
@@ -481,7 +481,7 @@ define <vscale x 4 x i32> @test_svcvt_s32_f32_x_1(<vscale x 4 x i1> %pg, <vscale
 ; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -497,7 +497,7 @@ define <vscale x 4 x i32> @test_svcvt_s32_f32_x_2(<vscale x 4 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -528,7 +528,7 @@ define <vscale x 4 x i32> @test_svcvt_u32_f32_x_1(<vscale x 4 x i1> %pg, <vscale
 ; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -544,7 +544,7 @@ define <vscale x 4 x i32> @test_svcvt_u32_f32_x_2(<vscale x 4 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -575,7 +575,7 @@ define <vscale x 2 x i64> @test_svcvt_s64_f64_x_1(<vscale x 2 x i1> %pg, <vscale
 ; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -591,7 +591,7 @@ define <vscale x 2 x i64> @test_svcvt_s64_f64_x_2(<vscale x 2 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -622,7 +622,7 @@ define <vscale x 2 x i64> @test_svcvt_u64_f64_x_1(<vscale x 2 x i1> %pg, <vscale
 ; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -638,7 +638,7 @@ define <vscale x 2 x i64> @test_svcvt_u64_f64_x_2(<vscale x 2 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -657,3 +657,527 @@ entry:
   %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 2 x i64> %0
 }
+
+define <vscale x 4 x i32> @test_fcvtzs_i32_f64_ptrue_u(double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_fcvtzs_i32_f64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i32_f64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f64(<vscale x 4 x i32> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzs_i32_f64_ptrue(double %z0, <vscale x 4 x i32> %x, <vscale x 2 x double> %y) {
+; CHECK-LABEL: test_fcvtzs_i32_f64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i32_f64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f64(<vscale x 4 x i32> %x, <vscale x 2 x i1> %pg, <vscale x 2 x double> %y)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzu_i32_f64_ptrue_u(double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_fcvtzu_i32_f64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i32_f64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f64(<vscale x 4 x i32> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzu_i32_f64_ptrue(double %z0, <vscale x 4 x i32> %x, <vscale x 2 x double> %y) {
+; CHECK-LABEL: test_fcvtzu_i32_f64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i32_f64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f64(<vscale x 4 x i32> %x, <vscale x 2 x i1> %pg, <vscale x 2 x double> %y)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzs_i64_f32_ptrue_u(double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_fcvtzs_i64_f32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i64_f32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f32(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzs_i64_f32_ptrue(double %z0, <vscale x 2 x i64> %x, <vscale x 4 x float> %y) {
+; CHECK-LABEL: test_fcvtzs_i64_f32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i64_f32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f32(<vscale x 2 x i64> %x, <vscale x 2 x i1> %pg, <vscale x 4 x float> %y)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzu_i64_f32_ptrue_u(double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_fcvtzu_i64_f32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i64_f32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f32(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzu_i64_f32_ptrue(double %z0, <vscale x 2 x i64> %x, <vscale x 4 x float> %y) {
+; CHECK-LABEL: test_fcvtzu_i64_f32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i64_f32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f32(<vscale x 2 x i64> %x, <vscale x 2 x i1> %pg, <vscale x 4 x float> %y)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzs_i32_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_fcvtzs_i32_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i32_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f16(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzs_i32_f16_ptrue(double %z0, <vscale x 4 x i32> %x, <vscale x 8 x half> %y) {
+; CHECK-LABEL: test_fcvtzs_i32_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i32_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f16(<vscale x 4 x i32> %x, <vscale x 4 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzu_i32_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_fcvtzu_i32_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i32_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f16(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzu_i32_f16_ptrue(double %z0, <vscale x 4 x i32> %x, <vscale x 8 x half> %y) {
+; CHECK-LABEL: test_fcvtzu_i32_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i32_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f16(<vscale x 4 x i32> %x, <vscale x 4 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzs_i64_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_fcvtzs_i64_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i64_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f16(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzs_i64_f16_ptrue(double %z0, <vscale x 2 x i64> %x, <vscale x 8 x half> %y) {
+; CHECK-LABEL: test_fcvtzs_i64_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i64_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f16(<vscale x 2 x i64> %x, <vscale x 2 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzu_i64_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_fcvtzu_i64_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i64_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f16(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzu_i64_f16_ptrue(double %z0, <vscale x 2 x i64> %x, <vscale x 8 x half> %y) {
+; CHECK-LABEL: test_fcvtzu_i64_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i64_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f16(<vscale x 2 x i64> %x, <vscale x 2 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 8 x i16> @test_fcvtzs_i16_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_fcvtzs_i16_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvtzs z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i16_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    fcvtzs z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 8 x i16> @test_fcvtzs_i16_f16_ptrue(double %z0, <vscale x 8 x i16> %x, <vscale x 8 x half> %y) {
+; CHECK-LABEL: test_fcvtzs_i16_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fcvtzs z0.h, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i16_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    fcvtzs z0.h, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16(<vscale x 8 x i16> %x, <vscale x 8 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 8 x i16> @test_fcvtzu_i16_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_fcvtzu_i16_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvtzu z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i16_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    fcvtzu z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 8 x i16> @test_fcvtzu_i16_f16_ptrue(double %z0, <vscale x 8 x i16> %x, <vscale x 8 x half> %y) {
+; CHECK-LABEL: test_fcvtzu_i16_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fcvtzu z0.h, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i16_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    fcvtzu z0.h, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16(<vscale x 8 x i16> %x, <vscale x 8 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzs_i32_f32_ptrue_u(double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_fcvtzs_i32_f32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i32_f32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzs_i32_f32_ptrue(double %z0, <vscale x 4 x i32> %x, <vscale x 4 x float> %y) {
+; CHECK-LABEL: test_fcvtzs_i32_f32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i32_f32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32(<vscale x 4 x i32> %x, <vscale x 4 x i1> %pg, <vscale x 4 x float> %y)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzu_i32_f32_ptrue_u(double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_fcvtzu_i32_f32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i32_f32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzu_i32_f32_ptrue(double %z0, <vscale x 4 x i32> %x, <vscale x 4 x float> %y) {
+; CHECK-LABEL: test_fcvtzu_i32_f32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i32_f32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32(<vscale x 4 x i32> %x, <vscale x 4 x i1> %pg, <vscale x 4 x float> %y)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzs_i64_f64_ptrue_u(double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_fcvtzs_i64_f64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i64_f64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzs_i64_f64_ptrue(double %z0, <vscale x 2 x i64> %x, <vscale x 2 x double> %y) {
+; CHECK-LABEL: test_fcvtzs_i64_f64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i64_f64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64(<vscale x 2 x i64> %x, <vscale x 2 x i1> %pg, <vscale x 2 x double> %y)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzu_i64_f64_ptrue_u(double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_fcvtzu_i64_f64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i64_f64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzu_i64_f64_ptrue(double %z0, <vscale x 2 x i64> %x, <vscale x 2 x double> %y) {
+; CHECK-LABEL: test_fcvtzu_i64_f64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i64_f64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64(<vscale x 2 x i64> %x, <vscale x 2 x i1> %pg, <vscale x 2 x double> %y)
+  ret <vscale x 2 x i64> %0
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll
deleted file mode 100644
index aefcad4..0000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll
+++ /dev/null
@@ -1,72 +0,0 @@
-; RUN: not llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel-abort=2 -pass-remarks-missed="gisel.*" -verify-machineinstrs=0 -o /dev/null 2>&1 %s | FileCheck -check-prefix=ERR %s
-
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: kernel_dynamic_stackalloc_vgpr_align4)
-; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_align4
-; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_align4 void (ptr addrspace(1)): unsupported dynamic alloca
-
-define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align4(ptr addrspace(1) %ptr) {
-  %id = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
-  %n = load i32, ptr addrspace(1) %gep
-  %alloca = alloca i32, i32 %n, align 4, addrspace(5)
-  store volatile i32 123, ptr addrspace(5) %alloca
-  ret void
-}
-
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: kernel_dynamic_stackalloc_vgpr_default_align)
-; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_default_align
-; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_default_align void (ptr addrspace(1)): unsupported dynamic alloca
-
-define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_default_align(ptr addrspace(1) %ptr) {
-  %id = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
-  %n = load i32, ptr addrspace(1) %gep
-  %alloca = alloca i32, i32 %n, addrspace(5)
-  store volatile i32 123, ptr addrspace(5) %alloca
-  ret void
-}
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 64 (in function: kernel_dynamic_stackalloc_vgpr_align64)
-; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_align64
-; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_align64 void (ptr addrspace(1)): unsupported dynamic alloca
-
-define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align64(ptr addrspace(1) %ptr) {
-  %id = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
-  %n = load i32, ptr addrspace(1) %gep
-  %alloca = alloca i32, i32 %n, align 64, addrspace(5)
-  store volatile i32 123, ptr addrspace(5) %alloca
-  ret void
-}
-
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_align4)
-; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align4
-; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align4 void (i32): unsupported dynamic alloca
-
-define void @func_dynamic_stackalloc_vgpr_align4(i32 %n) {
-  %alloca = alloca i32, i32 %n, align 4, addrspace(5)
-  store volatile i32 456, ptr addrspace(5) %alloca
-  ret void
-}
-
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_default_align)
-; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_default_align
-; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_default_align void (i32): unsupported dynamic alloca
-
-define void @func_dynamic_stackalloc_vgpr_default_align(i32 %n) {
-  %alloca = alloca i32, i32 %n, addrspace(5)
-  store volatile i32 456, ptr addrspace(5) %alloca
-  ret void
-}
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 64 (in function: func_dynamic_stackalloc_vgpr_align64)
-; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align64
-; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align64 void (i32): unsupported dynamic alloca
-
-define void @func_dynamic_stackalloc_vgpr_align64(i32 %n) {
-  %alloca = alloca i32, i32 %n, align 64, addrspace(5)
-  store volatile i32 456, ptr addrspace(5) %alloca
-  ret void
-}
-
-declare i32 @llvm.amdgcn.workitem.id.x() #0
-
-attributes #0 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index b0f2aac..7cafa2f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -3990,6 +3990,116 @@ bb:
   ret void
 }
 
+define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) {
+; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-NEXT:    v_add_u32_e32 v0, s3, v0
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-NEXT:    v_add3_u32 v0, s2, v0, -16
+; GFX9-NEXT:    v_mov_b32_e32 v1, 15
+; GFX9-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX10:       ; %bb.0: ; %bb
+; GFX10-NEXT:    s_add_u32 s0, s0, s5
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 15
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s2, v0
+; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:-16
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_endpgm
+;
+; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX940:       ; %bb.0: ; %bb
+; GFX940-NEXT:    v_add_u32_e32 v0, s1, v0
+; GFX940-NEXT:    v_add3_u32 v0, s0, v0, -16
+; GFX940-NEXT:    v_mov_b32_e32 v1, 15
+; GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_endpgm
+;
+; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:-16 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:-16 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; UNALIGNED_GFX9:       ; %bb.0: ; %bb
+; UNALIGNED_GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s5
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v0, s3, v0
+; UNALIGNED_GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
+; UNALIGNED_GFX9-NEXT:    v_add3_u32 v0, s2, v0, -16
+; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v1, 15
+; UNALIGNED_GFX9-NEXT:    scratch_store_dword v0, v1, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; UNALIGNED_GFX10:       ; %bb.0: ; %bb
+; UNALIGNED_GFX10-NEXT:    s_add_u32 s0, s0, s5
+; UNALIGNED_GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; UNALIGNED_GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; UNALIGNED_GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v0, s3, v0
+; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v1, 15
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v0, s2, v0
+; UNALIGNED_GFX10-NEXT:    scratch_store_dword v0, v1, off offset:-16
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; UNALIGNED_GFX940:       ; %bb.0: ; %bb
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v0, s1, v0
+; UNALIGNED_GFX940-NEXT:    v_add3_u32 v0, s0, v0, -16
+; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v1, 15
+; UNALIGNED_GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; UNALIGNED_GFX11:       ; %bb.0: ; %bb
+; UNALIGNED_GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
+; UNALIGNED_GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:-16 dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; UNALIGNED_GFX12:       ; %bb.0: ; %bb
+; UNALIGNED_GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
+; UNALIGNED_GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; UNALIGNED_GFX12-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:-16 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    s_endpgm
+bb:
+  %add1 = add nsw i32 %sidx, %vidx
+  %add2 = add nsw i32 %add1, -16
+  %gep = getelementptr inbounds [16 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2
+  store volatile i32 15, ptr addrspace(5) %gep, align 4
+  ret void
+}
+
 define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %scevgep) {
 ; GFX9-LABEL: sgpr_base_negative_offset:
 ; GFX9:       ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
index 0577117..d81faf9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
@@ -10,10 +10,10 @@ define float @v_pow_f32(float %x, float %y) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -25,19 +25,19 @@ define float @v_pow_f32(float %x, float %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_not_b32_e32 v1, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -49,19 +49,19 @@ define float @v_pow_f32(float %x, float %y) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_not_b32_e32 v1, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -73,17 +73,18 @@ define float @v_pow_f32(float %x, float %y) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_not_b32_e32 v1, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
@@ -91,32 +92,34 @@ define float @v_pow_f32(float %x, float %y) {
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %pow = call float @llvm.pow.f32(float %x, float %y)
   ret float %pow
@@ -127,111 +130,114 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v5, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v4
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v6
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, 1.0, v5, s[4:5]
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 5, v5
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 5, v4
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v4
+; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v4
 ; GFX6-NEXT:    v_log_f32_e32 v1, v1
-; GFX6-NEXT:    v_mov_b32_e32 v6, 0x42000000
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v7
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, v6, s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x42000000
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
+; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v6
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, v5, s[4:5]
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; GFX6-NEXT:    v_sub_f32_e32 v1, v1, v5
-; GFX6-NEXT:    v_mov_b32_e32 v7, 0x42800000
+; GFX6-NEXT:    v_mov_b32_e32 v6, 0x42800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v2
-; GFX6-NEXT:    v_add_f32_e32 v0, v0, v8
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, v7, s[4:5]
+; GFX6-NEXT:    v_add_f32_e32 v0, v0, v7
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[4:5]
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, 1.0, v4, s[4:5]
-; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT:    v_not_b32_e32 v4, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[4:5]
+; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_v2f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v4
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v6
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, 1.0, v5, s[4:5]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 5, v5
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 5, v4
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v4
+; GFX8-NEXT:    v_ldexp_f32 v1, v1, v4
 ; GFX8-NEXT:    v_log_f32_e32 v1, v1
-; GFX8-NEXT:    v_mov_b32_e32 v6, 0x42000000
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v7
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, v6, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x42000000
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
+; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v6
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, v5, s[4:5]
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v5
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0x42800000
+; GFX8-NEXT:    v_mov_b32_e32 v6, 0x42800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
 ; GFX8-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v2
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, v7, s[4:5]
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v7
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[4:5]
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_exp_f32_e32 v1, v1
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0x1f800000
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, 1.0, v4, s[4:5]
-; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX8-NEXT:    v_not_b32_e32 v4, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[4:5]
+; GFX8-NEXT:    v_ldexp_f32 v1, v1, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_v2f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v4
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 1.0, v5, s[4:5]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 5, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 5, v4
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v4
+; GFX9-NEXT:    v_ldexp_f32 v1, v1, v4
 ; GFX9-NEXT:    v_log_f32_e32 v1, v1
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0x42000000
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, v6, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x42000000
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
+; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, v5, s[4:5]
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x42800000
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x42800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
 ; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v2
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, v7, s[4:5]
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[4:5]
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_exp_f32_e32 v1, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x1f800000
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 1.0, v4, s[4:5]
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX9-NEXT:    v_not_b32_e32 v4, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[4:5]
+; GFX9-NEXT:    v_ldexp_f32 v1, v1, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_v2f32:
@@ -239,10 +245,12 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 1.0, 0x4f800000, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s4
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 5, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 5, v5
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v4
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 0x42000000, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 0x42000000, s4
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
@@ -257,46 +265,54 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 0x42800000, s4
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x1f800000, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0x1f800000, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 0xffffffc0, s4
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_exp_f32_e32 v1, v1
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_v2f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s0
-; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v4 :: v_dual_mul_f32 v1, v1, v5
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 0x42000000, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 5, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ldexp_f32 v1, v1, v5
 ; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 0x42000000, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    v_log_f32_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mul_dx9_zero_f32 v0, v0, v2 :: v_dual_mul_dx9_zero_f32 v1, v1, v3
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_sub_f32 v1, v1, v5 :: v_dual_lshlrev_b32 v4, 5, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 0x42000000, vcc_lo
+; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v1, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 0x42800000, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0x1f800000, s0
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_dual_sub_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3
+; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_exp_f32_e32 v1, v1
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-NEXT:    v_ldexp_f32 v1, v1, v3
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %pow = call <2 x float> @llvm.pow.v2f32(<2 x float> %x, <2 x float> %y)
   ret <2 x float> %pow
@@ -316,9 +332,9 @@ define half @v_pow_f16(half %x, half %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_not_b32_e32 v1, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -388,18 +404,18 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 1.0, v3, vcc
+; GFX6-NEXT:    v_not_b32_e32 v3, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v3, vcc
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v6
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v6
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -508,17 +524,17 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX6-NEXT:    v_mov_b32_e32 v5, 0x1f800000
+; GFX6-NEXT:    v_not_b32_e32 v5, 63
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_exp_f32_e32 v2, v0
-; GFX6-NEXT:    v_mul_f32_e32 v0, v1, v6
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v5, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v1, v6
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -634,17 +650,17 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v5
-; GFX6-NEXT:    v_mov_b32_e32 v5, 0x1f800000
+; GFX6-NEXT:    v_not_b32_e32 v5, 63
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v5, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v6
-; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v6
+; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -764,17 +780,17 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v2, v2, v5
-; GFX6-NEXT:    v_mov_b32_e32 v5, 0x1f800000
+; GFX6-NEXT:    v_not_b32_e32 v5, 63
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
 ; GFX6-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v0
-; GFX6-NEXT:    v_mul_f32_e32 v0, v2, v6
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v5, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v6
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -885,10 +901,10 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX6-NEXT:    v_ldexp_f32_e64 v0, |v0|, v2
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -900,19 +916,19 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_not_b32_e32 v1, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32_fabs_lhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX8-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT:    v_ldexp_f32 v0, |v0|, v2
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -924,19 +940,19 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_not_b32_e32 v1, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32_fabs_lhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX9-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX9-NEXT:    v_ldexp_f32 v0, |v0|, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -948,17 +964,18 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_not_b32_e32 v1, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32_fabs_lhs:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, |v0|
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4
-; GFX10-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX10-NEXT:    v_ldexp_f32 v0, |v0|, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s4
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
@@ -966,9 +983,9 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32_fabs_lhs:
@@ -976,23 +993,24 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
-; GFX11-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ldexp_f32 v0, |v0|, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %pow = call float @llvm.pow.f32(float %fabs.x, float %y)
@@ -1004,10 +1022,10 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1019,19 +1037,19 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_not_b32_e32 v1, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32_fabs_rhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1043,19 +1061,19 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_not_b32_e32 v1, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32_fabs_rhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1067,17 +1085,18 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_not_b32_e32 v1, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32_fabs_rhs:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
@@ -1085,32 +1104,34 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32_fabs_rhs:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e64 v0, v0, |v1|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fabs.y = call float @llvm.fabs.f32(float %y)
   %pow = call float @llvm.pow.f32(float %x, float %fabs.y)
@@ -1122,10 +1143,10 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX6-NEXT:    v_ldexp_f32_e64 v0, |v0|, v2
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1137,19 +1158,19 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_not_b32_e32 v1, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32_fabs_lhs_rhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX8-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT:    v_ldexp_f32 v0, |v0|, v2
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1161,19 +1182,19 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_not_b32_e32 v1, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32_fabs_lhs_rhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX9-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX9-NEXT:    v_ldexp_f32 v0, |v0|, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1185,17 +1206,18 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_not_b32_e32 v1, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32_fabs_lhs_rhs:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, |v0|
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4
-; GFX10-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX10-NEXT:    v_ldexp_f32 v0, |v0|, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s4
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
@@ -1203,9 +1225,9 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32_fabs_lhs_rhs:
@@ -1213,23 +1235,24 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
-; GFX11-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ldexp_f32 v0, |v0|, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e64 v0, v0, |v1|
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %fabs.y = call float @llvm.fabs.f32(float %y)
@@ -1241,10 +1264,10 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) {
 ; GFX6-LABEL: v_pow_f32_sgpr_vgpr:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v1, s0, v1
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX6-NEXT:    v_ldexp_f32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_log_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1256,18 +1279,18 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_not_b32_e32 v1, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: v_pow_f32_sgpr_vgpr:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v1, s0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX8-NEXT:    v_ldexp_f32 v1, s0, v1
 ; GFX8-NEXT:    v_log_f32_e32 v1, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1279,18 +1302,18 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_not_b32_e32 v1, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_pow_f32_sgpr_vgpr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v1, s0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX9-NEXT:    v_ldexp_f32 v1, s0, v1
 ; GFX9-NEXT:    v_log_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1302,49 +1325,51 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_not_b32_e32 v1, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: v_pow_f32_sgpr_vgpr:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s1, 0x800000, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s1
-; GFX10-NEXT:    v_mul_f32_e32 v1, s0, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX10-NEXT:    v_ldexp_f32 v1, s0, v1
 ; GFX10-NEXT:    v_log_f32_e32 v1, v1
 ; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v2
 ; GFX10-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: v_pow_f32_sgpr_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s1, 0x800000, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s1
-; GFX11-NEXT:    v_mul_f32_e32 v1, s0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f32 v1, s0, v1
 ; GFX11-NEXT:    v_log_f32_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %pow = call float @llvm.pow.f32(float %x, float %y)
   ret float %pow
@@ -1354,10 +1379,10 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) {
 ; GFX6-LABEL: v_pow_f32_vgpr_sgpr:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1369,18 +1394,18 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_not_b32_e32 v1, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: v_pow_f32_vgpr_sgpr:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1392,18 +1417,18 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_not_b32_e32 v1, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_pow_f32_vgpr_sgpr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1415,16 +1440,17 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_not_b32_e32 v1, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: v_pow_f32_vgpr_sgpr:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -1432,31 +1458,33 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) {
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: v_pow_f32_vgpr_sgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, s0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %pow = call float @llvm.pow.f32(float %x, float %y)
   ret float %pow
@@ -1466,10 +1494,10 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) {
 ; GFX6-LABEL: v_pow_f32_sgpr_sgpr:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, s0, v0
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, s0, v0
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1481,18 +1509,18 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_not_b32_e32 v1, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: v_pow_f32_sgpr_sgpr:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, s0, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX8-NEXT:    v_ldexp_f32 v0, s0, v0
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1504,18 +1532,18 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_not_b32_e32 v1, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_pow_f32_sgpr_sgpr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, s0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX9-NEXT:    v_ldexp_f32 v0, s0, v0
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1527,49 +1555,51 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_not_b32_e32 v1, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: v_pow_f32_sgpr_sgpr:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s2, 0x800000, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, s2
-; GFX10-NEXT:    v_mul_f32_e32 v0, s0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX10-NEXT:    v_ldexp_f32 v0, s0, v0
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_legacy_f32_e32 v0, s1, v0
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: v_pow_f32_sgpr_sgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s2, 0x800000, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, s2
-; GFX11-NEXT:    v_mul_f32_e32 v0, s0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f32 v0, s0, v0
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, s1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %pow = call float @llvm.pow.f32(float %x, float %y)
   ret float %pow
@@ -1580,10 +1610,10 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT:    v_mul_f32_e64 v0, -v0, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX6-NEXT:    v_ldexp_f32_e64 v0, -v0, v2
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1595,19 +1625,19 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_not_b32_e32 v1, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32_fneg_lhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX8-NEXT:    v_mul_f32_e64 v0, -v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT:    v_ldexp_f32 v0, -v0, v2
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1619,19 +1649,19 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_not_b32_e32 v1, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32_fneg_lhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX9-NEXT:    v_mul_f32_e64 v0, -v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX9-NEXT:    v_ldexp_f32 v0, -v0, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1643,17 +1673,18 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_not_b32_e32 v1, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32_fneg_lhs:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, -v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4
-; GFX10-NEXT:    v_mul_f32_e64 v0, -v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX10-NEXT:    v_ldexp_f32 v0, -v0, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s4
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
@@ -1661,9 +1692,9 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32_fneg_lhs:
@@ -1671,23 +1702,24 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
-; GFX11-NEXT:    v_mul_f32_e64 v0, -v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ldexp_f32 v0, -v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg float %x
   %pow = call float @llvm.pow.f32(float %neg.x, float %y)
@@ -1699,10 +1731,10 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1714,19 +1746,19 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_not_b32_e32 v1, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32_fneg_rhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1738,19 +1770,19 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_not_b32_e32 v1, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32_fneg_rhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1762,17 +1794,18 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_not_b32_e32 v1, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32_fneg_rhs:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
@@ -1780,32 +1813,34 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32_fneg_rhs:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e64 v0, v0, -v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.y = fneg float %y
   %pow = call float @llvm.pow.f32(float %x, float %neg.y)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
index 55015c6d..cdb67ca 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
@@ -20,8 +20,8 @@ body: |
     ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]]
     ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
     ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]]
-    ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
-    ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]]
+    ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_fake16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
+    ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_fake16_e64_]]
     ;
     ; GFX11-FAKE16-LABEL: name: fcmp_false_f16
     ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1
@@ -58,8 +58,8 @@ body: |
     ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]]
     ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
     ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]]
-    ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
-    ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]]
+    ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_fake16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
+    ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_fake16_e64_]]
     ;
     ; GFX11-FAKE16-LABEL: name: fcmp_true_f16
     ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir
index 4241f94..ed811d3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir
@@ -20,8 +20,8 @@ body: |
     ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]]
     ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
     ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]]
-    ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
-    ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]]
+    ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_fake16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
+    ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_fake16_e64_]]
     ;
     ; GFX11-FAKE16-LABEL: name: fcmp_false_f16
     ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1
@@ -58,8 +58,8 @@ body: |
     ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]]
     ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
     ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]]
-    ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
-    ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]]
+    ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_fake16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
+    ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_fake16_e64_]]
     ;
     ; GFX11-FAKE16-LABEL: name: fcmp_true_f16
     ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
index eeb7b13..fe002d6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
@@ -18,9 +18,9 @@ define i16 @v_powi_f16(i16 %l, i32 %r) {
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
 ; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_exp_f32_e32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_not_b32_e32 v1, 63
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -75,53 +75,80 @@ define i16 @v_powi_f16(i16 %l, i32 %r) {
 }
 
 define float @v_powi_f32(float %l, i32 %r) {
-; GFX78-LABEL: v_powi_f32:
-; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX78-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX78-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX78-NEXT:    v_log_f32_e32 v0, v0
-; GFX78-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x42000000
-; GFX78-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX78-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX78-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_exp_f32_e32 v0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_powi_f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0x800000
+; GFX7-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_log_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX7-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_exp_f32_e32 v0, v0
+; GFX7-NEXT:    v_not_b32_e32 v1, 63
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_powi_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x800000
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX8-NEXT:    v_log_f32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_exp_f32_e32 v0, v0
+; GFX8-NEXT:    v_not_b32_e32 v1, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 %r)
   ret float %res
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir
index 5378ce2..10517a4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir
@@ -491,3 +491,132 @@ body: |
     %1:_(p5) = G_DYN_STACKALLOC %0, 32
     S_ENDPGM 0, implicit %1
 ...
+
+---
+name: test_dyn_stackalloc_vgpr_align4
+legalized:       true
+frameInfo:
+  maxAlignment: 4
+stack:
+  - { id: 0, type: variable-sized, alignment: 4 }
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align4
+    ; WAVE64: liveins: $vgpr0
+    ; WAVE64-NEXT: {{  $}}
+    ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+    ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+    ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+    ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+    ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+    ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+    ;
+    ; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align4
+    ; WAVE32: liveins: $vgpr0
+    ; WAVE32-NEXT: {{  $}}
+    ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+    ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+    ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+    ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+    ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+    ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(p5) = G_DYN_STACKALLOC %0, 4
+    S_ENDPGM 0, implicit %1
+...
+
+---
+name: test_dyn_stackalloc_vgpr_align16
+legalized:       true
+frameInfo:
+  maxAlignment: 16
+stack:
+  - { id: 0, type: variable-sized, alignment: 16 }
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align16
+    ; WAVE64: liveins: $vgpr0
+    ; WAVE64-NEXT: {{  $}}
+    ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+    ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+    ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+    ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+    ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+    ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+    ;
+    ; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align16
+    ; WAVE32: liveins: $vgpr0
+    ; WAVE32-NEXT: {{  $}}
+    ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+    ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+    ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+    ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+    ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+    ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(p5) = G_DYN_STACKALLOC %0, 16
+    S_ENDPGM 0, implicit %1
+...
+
+---
+name: test_dyn_stackalloc_vgpr_align64
+legalized:       true
+frameInfo:
+  maxAlignment: 64
+stack:
+  - { id: 0, type: variable-sized, alignment: 64 }
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align64
+    ; WAVE64: liveins: $vgpr0
+    ; WAVE64-NEXT: {{  $}}
+    ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+    ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+    ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+    ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
+    ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32)
+    ; WAVE64-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -4096
+    ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32)
+    ; WAVE64-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32)
+    ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5)
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5)
+    ;
+    ; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align64
+    ; WAVE32: liveins: $vgpr0
+    ; WAVE32-NEXT: {{  $}}
+    ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+    ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+    ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+    ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2047
+    ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32)
+    ; WAVE32-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048
+    ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32)
+    ; WAVE32-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32)
+    ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5)
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(p5) = G_DYN_STACKALLOC %0, 64
+    S_ENDPGM 0, implicit %1
+...
diff --git a/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll b/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll
new file mode 100644
index 0000000..52259c4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+; Reduce a 64-bit add by a constant if we know the low 32-bits are all
+; zero.
+
+; add i64:x, K if computeTrailingZeros(K) >= 32
+; => build_pair (add x.hi, K.hi), x.lo
+
+define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_0(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 0x40000
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add i64 %reg, 1125899906842624 ; (1 << 50)
+  ret i64 %add
+}
+
+define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_1(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_low_bits_known0_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 1
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add i64 %reg, 4294967296 ; (1 << 32)
+  ret i64 %add
+}
+
+define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_2(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_low_bits_known0_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 2
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add i64 %reg, 8589934592 ; (1 << 33)
+  ret i64 %add
+}
+
+define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_3(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_low_bits_known0_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 0x80000000
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add i64 %reg, -9223372036854775808 ; (1 << 63)
+  ret i64 %add
+}
+
+define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_4(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_low_bits_known0_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, -1
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add i64 %reg, -4294967296 ; 0xffffffff00000000
+  ret i64 %add
+}
+
+define i64 @v_add_i64_const_low_bits_known0_0(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x40000, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add i64 %reg, 1125899906842624 ; (1 << 50)
+  ret i64 %add
+}
+
+define i64 @v_add_i64_const_low_bits_known0_1(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_low_bits_known0_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add i64 %reg, 4294967296 ; (1 << 32)
+  ret i64 %add
+}
+
+define i64 @v_add_i64_const_low_bits_known0_2(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_low_bits_known0_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 2, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add i64 %reg, 8589934592 ; (1 << 33)
+  ret i64 %add
+}
+
+define i64 @v_add_i64_const_low_bits_known0_3(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_low_bits_known0_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add i64 %reg, -9223372036854775808 ; (1 << 63)
+  ret i64 %add
+}
+
+define i64 @v_add_i64_const_low_bits_known0_4(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_low_bits_known0_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add i64 %reg, -4294967296 ; 0xffffffff00000000
+  ret i64 %add
+}
+
+define amdgpu_ps i64 @s_add_i64_const_high_bits_known0_0(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_u32 s0, s0, -1
+; GFX9-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add i64 %reg, 4294967295 ; (1 << 31)
+  ret i64 %add
+}
+
+define i64 @v_add_i64_const_high_bits_known0_0(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, -1, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add i64 %reg, 4294967295 ; (1 << 31)
+  ret i64 %add
+}
+
+define <2 x i64> @v_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> %reg) {
+; GFX9-LABEL: v_add_v2i64_splat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT:    v_add_u32_e32 v3, 1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
+  ret <2 x i64> %add
+}
+
+define <2 x i64> @v_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) {
+; GFX9-LABEL: v_add_v2i64_nonsplat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT:    v_add_u32_e32 v3, 2, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
+  ret <2 x i64> %add
+}
+
+define amdgpu_ps <2 x i64> @s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
+; GFX9-LABEL: s_add_v2i64_splat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 1
+; GFX9-NEXT:    s_add_i32 s3, s3, 1
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
+  ret <2 x i64> %add
+}
+
+define amdgpu_ps <2 x i64> @s_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
+; GFX9-LABEL: s_add_v2i64_nonsplat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 1
+; GFX9-NEXT:    s_add_i32 s3, s3, 2
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
+  ret <2 x i64> %add
+}
+
+; We could reduce this to use a 32-bit add if we use computeKnownBits
+define i64 @v_add_i64_variable_high_bits_known0_0(i64 %reg, i32 %offset.hi32) {
+; GFX9-LABEL: v_add_i64_variable_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %zext.offset.hi32 = zext i32 %offset.hi32 to i64
+  %in.high.bits = shl i64 %zext.offset.hi32, 32
+  %add = add i64 %reg, %in.high.bits
+  ret i64 %add
+}
+
+; We could reduce this to use a 32-bit add if we use computeKnownBits
+define amdgpu_ps i64 @s_add_i64_variable_high_bits_known0_0(i64 inreg %reg, i32 inreg %offset.hi32) {
+; GFX9-LABEL: s_add_i64_variable_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_u32 s0, s0, 0
+; GFX9-NEXT:    s_addc_u32 s1, s1, s2
+; GFX9-NEXT:    ; return to shader part epilog
+  %zext.offset.hi32 = zext i32 %offset.hi32 to i64
+  %in.high.bits = shl i64 %zext.offset.hi32, 32
+  %add = add i64 %reg, %in.high.bits
+  ret i64 %add
+}
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
index 5b72795..b128be2 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX1030 %s
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX1100 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
 
 define float @fmul_select_f32_test1(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX7-LABEL: fmul_select_f32_test1:
@@ -21,22 +25,22 @@ define float @fmul_select_f32_test1(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f32_test1:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test1:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, float 2.000000e+00, float 1.000000e+00
   %ldexp = fmul float %x, %y
@@ -60,22 +64,22 @@ define float @fmul_select_f32_test2(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f32_test2:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test2:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test2:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, float 5.000000e-01, float 1.000000e+00
   %ldexp = fmul float %x, %y
@@ -83,49 +87,71 @@ define float @fmul_select_f32_test2(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 }
 
 define <2 x float> @fmul_select_v2f32_test3(<2 x float> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2f32_test3:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2f32_test3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2f32_test3:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f32_test3:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
-; GFX1100-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2f32_test3:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2f32_test3:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2f32_test3:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2f32_test3:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_select_v2f32_test3:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_v2f32_test3:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x float> <float 1.000000e+00, float 1.000000e+00>
   %ldexp = fmul <2 x float> %x, %y
@@ -133,49 +159,71 @@ define <2 x float> @fmul_select_v2f32_test3(<2 x float> %x, <2 x i32> %bool.arg1
 }
 
 define <2 x float> @fmul_select_v2f32_test4(<2 x float> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2f32_test4:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2f32_test4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2f32_test4:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f32_test4:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
-; GFX1100-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2f32_test4:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2f32_test4:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2f32_test4:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2f32_test4:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_select_v2f32_test4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_v2f32_test4:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x float> <float 5.000000e-01, float 5.000000e-01>, <2 x float> <float 1.000000e+00, float 1.000000e+00>
   %ldexp = fmul <2 x float> %x, %y
@@ -199,22 +247,22 @@ define float @fmul_select_f32_test5(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f32_test5:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, -1.0, -2.0, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test5:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, -1.0, -2.0, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, -1.0, -2.0, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test5:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, -1.0, -2.0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, float -2.000000e+00, float -1.000000e+00
   %ldexp = fmul float %x, %y
@@ -222,44 +270,83 @@ define float @fmul_select_f32_test5(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 }
 
 define float @fmul_select_f32_test6(float %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f32_test6:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0x41000000
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0xc0400000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f32_test6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x41000000
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xc0400000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f32_test6:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0xc0400000
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x41000000, v3, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test6:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v3, 0xc0400000
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x41000000, v3, vcc_lo
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f32_test6:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v4, 0xc0400000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f32_test6:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0400000
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0x41000000
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f32_test6:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xc0400000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f32_test6:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0400000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x41000000
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f32_test6:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc0400000
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x41000000, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f32_test6:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xc0400000, vcc_lo
+; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f32_test6:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc0400000
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x41000000, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f32_test6:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xc0400000, vcc_lo
+; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, float -3.000000e+00, float 8.000000e+00
   %ldexp = fmul float %x, %y
@@ -285,22 +372,22 @@ define float @fmul_select_f32_test7_sel_log2val_pos59_pos92(float %x, i32 %bool.
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 0x5c, 59, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0x5c, 59, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0x5c, 59, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0x5c, 59, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, float 0x43A0000000000000, float 0x45B0000000000000
   %ldexp = fmul float %x, %y
@@ -308,44 +395,83 @@ define float @fmul_select_f32_test7_sel_log2val_pos59_pos92(float %x, i32 %bool.
 }
 
 define float @fmul_select_f32_test8(float %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f32_test8:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0xc1000000
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0x41800000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f32_test8:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xc1000000
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x41800000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f32_test8:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x41800000
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0xc1000000, v3, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test8:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v3, 0x41800000
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0xc1000000, v3, vcc_lo
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f32_test8:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc1000000
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v4, 0x41800000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f32_test8:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41800000
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc1000000
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f32_test8:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc1000000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x41800000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f32_test8:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41800000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc1000000
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f32_test8:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41800000
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xc1000000, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f32_test8:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1000000
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x41800000, vcc_lo
+; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f32_test8:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41800000
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xc1000000, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f32_test8:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1000000
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x41800000, vcc_lo
+; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, float 1.600000e+01, float -8.000000e+00
   %ldexp = fmul float %x, %y
@@ -369,22 +495,22 @@ define float @fmul_select_f32_test9(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f32_test9:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 2.0, 0, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test9:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 2.0, 0, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test9:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 2.0, 0, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test9:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 2.0, 0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, float 0.000000e+00, float 2.000000e+00
   %ldexp = fmul float %x, %y
@@ -410,22 +536,22 @@ define float @fmul_select_f32_test10(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f32_test10:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test10:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test10:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test10:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, float -0.000000e+00, float 0.000000e+00
   %ldexp = fmul float %x, %y
@@ -451,22 +577,22 @@ define float @fmul_select_f32_test11_sel_log2val_pos78_pos56(float %x, i32 %bool
 ; GFX9-NEXT:    v_ldexp_f32 v0, -v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 56, 0x4e, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f32 v0, -v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 56, 0x4e, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f32 v0, -v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 56, 0x4e, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v0, -v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 56, 0x4e, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f32 v0, -v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, float 0xC4D0000000000000, float 0xC370000000000000
   %ldexp = fmul float %x, %y
@@ -474,44 +600,83 @@ define float @fmul_select_f32_test11_sel_log2val_pos78_pos56(float %x, i32 %bool
 }
 
 define float @fmul_select_f32_test12_sel_log2val_neg48_pos68(float %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0x44
-; GFX7-NEXT:    v_not_b32_e32 v4, 47
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x44
-; GFX9-NEXT:    v_not_b32_e32 v4, 47
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_not_b32_e32 v3, 47
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x44, v3, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_not_b32_e32 v3, 47
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x44, v3, vcc_lo
-; GFX1100-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0x44
+; GFX7-SDAG-NEXT:    v_not_b32_e32 v4, 47
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_not_b32_e32 v3, 47
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0x44
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x44
+; GFX9-SDAG-NEXT:    v_not_b32_e32 v4, 47
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_not_b32_e32 v3, 47
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x44
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_not_b32_e32 v3, 47
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x44, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x44
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xffffffd0, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_not_b32_e32 v3, 47
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x44, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x44
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xffffffd0, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, float 0x3CF0000000000000, float 0x4430000000000000
   %ldexp = fmul float %x, %y
@@ -535,22 +700,22 @@ define double @fmul_select_f64_test1(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f64_test1:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test1:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f64_test1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double 2.000000e+00, double 1.000000e+00
   %ldexp = fmul double %x, %y
@@ -574,22 +739,22 @@ define double @fmul_select_f64_test2(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f64_test2:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test2:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f64_test2:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double 5.000000e-01, double 1.000000e+00
   %ldexp = fmul double %x, %y
@@ -619,28 +784,28 @@ define <2 x double> @fmul_select_v2f64_test3(<2 x double> %x, <2 x i32> %bool.ar
 ; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_v2f64_test3:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1030-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; GFX1030-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f64_test3:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1100-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; GFX1100-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
-; GFX1100-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_v2f64_test3:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_v2f64_test3:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
   %ldexp = fmul <2 x double> %x, %y
@@ -670,28 +835,28 @@ define <2 x double> @fmul_select_v2f64_test4(<2 x double> %x, <2 x i32> %bool.ar
 ; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_v2f64_test4:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1030-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; GFX1030-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f64_test4:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1100-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; GFX1100-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc_lo
-; GFX1100-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_v2f64_test4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_v2f64_test4:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc_lo
+; GFX11-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x double> <double 5.000000e-01, double 5.000000e-01>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
   %ldexp = fmul <2 x double> %x, %y
@@ -715,22 +880,22 @@ define double @fmul_select_f64_test5(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f64_test5:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test5:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f64_test5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test5:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double -5.000000e-01, double -1.000000e+00
   %ldexp = fmul double %x, %y
@@ -754,22 +919,22 @@ define double @fmul_select_f64_test6(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f64_test6:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test6:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f64_test6:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test6:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double -2.000000e+00, double -1.000000e+00
   %ldexp = fmul double %x, %y
@@ -777,44 +942,64 @@ define double @fmul_select_f64_test6(double %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define double @fmul_select_f64_test7(double %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f64_test7:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0xbff00000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v4, 2.0, vcc
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f64_test7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xbff00000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, 2.0, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f64_test7:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1030-NEXT:    v_cndmask_b32_e64 v5, 0xbff00000, 2.0, vcc_lo
-; GFX1030-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test7:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1100-NEXT:    v_cndmask_b32_e64 v5, 0xbff00000, 2.0, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f64_test7:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbff00000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v3, v4, 2.0, vcc
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f64_test7:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v5, 0xbff00000
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, 2.0, vcc
+; GFX7-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f64_test7:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbff00000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v3, v4, 2.0, vcc
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f64_test7:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xbff00000
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, 2.0, vcc
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_select_f64_test7:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0xbff00000, 2.0, vcc_lo
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test7:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0xbff00000, 2.0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double 2.000000e+00, double -1.000000e+00
   %ldexp = fmul double %x, %y
@@ -838,22 +1023,22 @@ define double @fmul_select_f64_test8(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f64_test8:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 5, 2, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test8:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 5, 2, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f64_test8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 5, 2, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test8:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 5, 2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double -4.000000e+00, double -3.200000e+01
   %ldexp = fmul double %x, %y
@@ -883,28 +1068,28 @@ define <2 x double> @fmul_select_v2f64_test9(<2 x double> %x, <2 x i32> %bool.ar
 ; GFX9-NEXT:    v_ldexp_f64 v[2:3], -v[2:3], v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_v2f64_test9:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1030-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v4
-; GFX1030-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f64 v[2:3], -v[2:3], v5
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f64_test9:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1100-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v4
-; GFX1100-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
-; GFX1100-NEXT:    v_ldexp_f64 v[2:3], -v[2:3], v5
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_v2f64_test9:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[2:3], -v[2:3], v5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_v2f64_test9:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NEXT:    v_ldexp_f64 v[2:3], -v[2:3], v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x double> <double -2.000000e+00, double -2.000000e+00>, <2 x double> <double -1.000000e+00, double -1.000000e+00>
   %ldexp = fmul <2 x double> %x, %y
@@ -912,60 +1097,115 @@ define <2 x double> @fmul_select_v2f64_test9(<2 x double> %x, <2 x i32> %bool.ar
 }
 
 define <2 x double> @fmul_select_v2f64_test10(<2 x double> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2f64_test10:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, 0xbff00000
-; GFX7-NEXT:    v_mov_b32_e32 v9, 0x3fe00000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
-; GFX7-NEXT:    v_cndmask_b32_e32 v9, v8, v9, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
-; GFX7-NEXT:    v_mov_b32_e32 v8, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX7-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
-; GFX7-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2f64_test10:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v8, 0xbff00000
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0x3fe00000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v8, v9, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
-; GFX9-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
-; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2f64_test10:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v8, 0x3fe00000
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1030-NEXT:    v_cndmask_b32_e32 v9, 0xbff00000, v8, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1030-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1030-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX1030-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
-; GFX1030-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f64_test10:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v8, 0x3fe00000
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-NEXT:    v_dual_cndmask_b32 v9, 0xbff00000, v8 :: v_dual_mov_b32 v8, 0
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1100-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
-; GFX1100-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2f64_test10:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v8, 0xbff00000
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v9, 0x3fe00000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v9, v8, v9, vcc
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v8, 0
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX7-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX7-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2f64_test10:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fe00000
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v8, 0
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX7-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX7-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2f64_test10:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v8, 0xbff00000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v9, 0x3fe00000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v9, v8, v9, vcc
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX9-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2f64_test10:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fe00000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_v2f64_test10:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v8, 0x3fe00000
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v9, 0xbff00000, v8, vcc_lo
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v8, 0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX10-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_v2f64_test10:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v9, 0xbff00000
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v8, 0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v9, v9, 0x3fe00000, vcc_lo
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_v2f64_test10:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v8, 0x3fe00000
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_dual_cndmask_b32 v9, 0xbff00000, v8 :: v_dual_mov_b32 v8, 0
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_v2f64_test10:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v9, 0xbff00000 :: v_dual_mov_b32 v8, 0
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v9, v9, 0x3fe00000, vcc_lo
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x double> <double 5.000000e-01, double 2.000000e+00>, <2 x double> <double -1.000000e+00, double 1.000000e+00>
   %ldexp = fmul <2 x double> %x, %y
@@ -973,44 +1213,64 @@ define <2 x double> @fmul_select_v2f64_test10(<2 x double> %x, <2 x i32> %bool.a
 }
 
 define double @fmul_select_f64_test11(double %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f64_test11:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v4, -2.0, vcc
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f64_test11:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, -2.0, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f64_test11:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1030-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, -2.0, vcc_lo
-; GFX1030-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test11:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1100-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, -2.0, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f64_test11:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v3, v4, -2.0, vcc
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f64_test11:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, -2.0, vcc
+; GFX7-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f64_test11:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v3, v4, -2.0, vcc
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f64_test11:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, -2.0, vcc
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_select_f64_test11:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, -2.0, vcc_lo
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test11:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, -2.0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double -2.000000e+00, double -0.000000e+00
   %ldexp = fmul double %x, %y
@@ -1018,45 +1278,84 @@ define double @fmul_select_f64_test11(double %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define double @fmul_select_f64_test12(double %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f64_test12:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 31, v2
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f64_test12:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v2, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 31, v2
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f64_test12:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1030-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v3, 31, v3
-; GFX1030-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test12:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 31, v3
-; GFX1100-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f64_test12:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, v2, v3
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 31, v2
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f64_test12:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX7-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f64_test12:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, v2, v3
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 31, v2
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f64_test12:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f64_test12:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v3
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 31, v3
+; GFX10-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f64_test12:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, 0, vcc_lo
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f64_test12:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v3
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 31, v3
+; GFX11-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f64_test12:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, 0, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double 0.000000e+00, double -0.000000e+00
   %ldexp = fmul double %x, %y
@@ -1084,24 +1383,24 @@ define double @fmul_select_f64_test13(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f64_test13:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1030-NEXT:    v_cndmask_b32_e64 v5, 0x40300000, 0, vcc_lo
-; GFX1030-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test13:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1100-NEXT:    v_cndmask_b32_e64 v5, 0x40300000, 0, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f64_test13:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0x40300000, 0, vcc_lo
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test13:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0x40300000, 0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double 0.000000e+00, double 1.600000e+01
   %ldexp = fmul double %x, %y
@@ -1109,44 +1408,83 @@ define double @fmul_select_f64_test13(double %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define double @fmul_select_f64_test14_sel_log2val_pos92_neg27(double %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_not_b32_e32 v4, 26
-; GFX7-NEXT:    v_mov_b32_e32 v5, 0x5c
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
-; GFX7-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_not_b32_e32 v4, 26
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x5c
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
-; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v4, 0x5c
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0xffffffe5, v4, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v4, 0x5c
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0xffffffe5, v4, vcc_lo
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_not_b32_e32 v4, 26
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v5, 0x5c
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX7-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5c
+; GFX7-GISEL-NEXT:    v_not_b32_e32 v5, 26
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_not_b32_e32 v4, 26
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x5c
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5c
+; GFX9-GISEL-NEXT:    v_not_b32_e32 v5, 26
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5c
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0xffffffe5, v4, vcc_lo
+; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_not_b32_e32 v4, 26
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v2, v4, 0x5c, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5c
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0xffffffe5, v4, vcc_lo
+; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_not_b32_e32 v4, 26
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, v4, 0x5c, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double 0x45B0000000000000, double 0x3E40000000000000
   %ldexp = fmul double %x, %y
@@ -1154,44 +1492,83 @@ define double @fmul_select_f64_test14_sel_log2val_pos92_neg27(double %x, i32 %bo
 }
 
 define double @fmul_select_f64_test15_sel_log2val_neg42_neg33(double %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_not_b32_e32 v4, 32
-; GFX7-NEXT:    v_not_b32_e32 v5, 41
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
-; GFX7-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_not_b32_e32 v4, 32
-; GFX9-NEXT:    v_not_b32_e32 v5, 41
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
-; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_not_b32_e32 v4, 41
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0xffffffdf, v4, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_not_b32_e32 v4, 41
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0xffffffdf, v4, vcc_lo
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_not_b32_e32 v4, 32
+; GFX7-SDAG-NEXT:    v_not_b32_e32 v5, 41
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX7-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_not_b32_e32 v4, 41
+; GFX7-GISEL-NEXT:    v_not_b32_e32 v5, 32
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_not_b32_e32 v4, 32
+; GFX9-SDAG-NEXT:    v_not_b32_e32 v5, 41
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_not_b32_e32 v4, 41
+; GFX9-GISEL-NEXT:    v_not_b32_e32 v5, 32
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_not_b32_e32 v4, 41
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0xffffffdf, v4, vcc_lo
+; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_not_b32_e32 v4, 32
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v2, v4, 0xffffffd6, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_not_b32_e32 v4, 41
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0xffffffdf, v4, vcc_lo
+; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_not_b32_e32 v4, 32
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, v4, 0xffffffd6, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double 0x3D50000000000000, double 0x3DE0000000000000
   %ldexp = fmul double %x, %y
@@ -1200,40 +1577,82 @@ define double @fmul_select_f64_test15_sel_log2val_neg42_neg33(double %x, i32 %bo
 
 
 define half @fmul_select_f16_test1(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test1:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test1:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test1:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test1:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test1:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test1:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test1:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test1:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test1:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test1:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half 2.000000e+00, half 1.000000e+00
   %ldexp = fmul half %x, %y
@@ -1241,47 +1660,89 @@ define half @fmul_select_f16_test1(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 }
 
 define half @fmul_select_f16_test2(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test2:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
-; GFX9-NEXT:    s_movk_i32 s4, 0x8000
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX9-NEXT:    v_med3_i32 v1, v1, s4, v2
-; GFX9-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test2:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    s_movk_i32 s4, 0x8000
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX1030-NEXT:    v_med3_i32 v1, v1, s4, 0x7fff
-; GFX1030-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test2:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_movk_i32 s0, 0x8000
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
-; GFX1100-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test2:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test2:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test2:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX9-SDAG-NEXT:    v_med3_i32 v1, v1, s4, v2
+; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test2:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test2:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    s_movk_i32 s4, 0x8000
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-SDAG-NEXT:    v_med3_i32 v1, v1, s4, 0x7fff
+; GFX10-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test2:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test2:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
+; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test2:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half 5.000000e-01, half 1.000000e+00
   %ldexp = fmul half %x, %y
@@ -1289,59 +1750,126 @@ define half @fmul_select_f16_test2(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 }
 
 define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2f16_test3:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2f16_test3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3c00
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0x4000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2f16_test3:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x4000
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
-; GFX1030-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX1030-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f16_test3:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v5, 0x4000
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
-; GFX1100-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2f16_test3:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2f16_test3:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2f16_test3:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3c00
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x4000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2f16_test3:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v3, v4
+; GFX9-GISEL-NEXT:    v_med3_i32 v2, v2, v3, v4
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v1, v0, v1
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_v2f16_test3:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0x4000
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX10-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_v2f16_test3:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v1, v0, v1
+; GFX10-GISEL-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v3
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_v2f16_test3:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v5, 0x4000
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
+; GFX11-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_v2f16_test3:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v3
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v2
+; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x half> <half 2.000000e+00, half 2.000000e+00>, <2 x half> <half 1.000000e+00, half 1.000000e+00>
   %ldexp = fmul <2 x half> %x, %y
@@ -1349,59 +1877,126 @@ define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1,
 }
 
 define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2f16_test4:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2f16_test4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3c00
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3800
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2f16_test4:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x3800
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
-; GFX1030-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX1030-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f16_test4:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v5, 0x3800
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
-; GFX1100-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2f16_test4:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2f16_test4:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2f16_test4:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3c00
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x3800
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2f16_test4:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v3, v4
+; GFX9-GISEL-NEXT:    v_med3_i32 v2, v2, v3, v4
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v1, v0, v1
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_v2f16_test4:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3800
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX10-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_v2f16_test4:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v1, v0, v1
+; GFX10-GISEL-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v3
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_v2f16_test4:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3800
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
+; GFX11-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_v2f16_test4:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v3
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v2
+; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x half> <half 5.000000e-01, half 5.000000e-01>, <2 x half> <half 1.000000e+00, half 1.000000e+00>
   %ldexp = fmul <2 x half> %x, %y
@@ -1409,15 +2004,25 @@ define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1,
 }
 
 define half @fmul_select_f16_test5(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test5:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test5:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test5:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmul_select_f16_test5:
 ; GFX9:       ; %bb.0:
@@ -1427,22 +2032,22 @@ define half @fmul_select_f16_test5(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f16_test5:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test5:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f16_test5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
+; GFX10-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f16_test5:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half 2.000000e+00, half 8.000000e+00
   %ldexp = fmul half %x, %y
@@ -1450,46 +2055,88 @@ define half @fmul_select_f16_test5(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 }
 
 define half @fmul_select_f16_test6(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test6:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0x40400000
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0xc1000000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4200
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xc800
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test6:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0xc800
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo
-; GFX1030-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test6:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v3, 0xc800
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo
-; GFX1100-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test6:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0x40400000
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v4, 0xc1000000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test6:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc800
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4200
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test6:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4200
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xc800
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test6:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc800
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4200
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test6:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc800
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test6:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4200
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xc800, vcc_lo
+; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test6:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc800
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test6:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4200
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xc800, vcc_lo
+; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half -8.000000e+00, half 3.000000e+00
   %ldexp = fmul half %x, %y
@@ -1497,45 +2144,87 @@ define half @fmul_select_f16_test6(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 }
 
 define half @fmul_select_f16_test7(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test7:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0x41000000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, -4.0, v3, vcc
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xc400
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4800
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test7:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x4800
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo
-; GFX1030-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test7:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v3, 0x4800
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo
-; GFX1100-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test7:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, -4.0, v3, vcc
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test7:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4800
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc400
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test7:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc400
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4800
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test7:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4800
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc400
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test7:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4800
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test7:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc400
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x4800, vcc_lo
+; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test7:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4800
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test7:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc400
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x4800, vcc_lo
+; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half 8.000000e+00, half -4.000000e+00
   %ldexp = fmul half %x, %y
@@ -1543,16 +2232,28 @@ define half @fmul_select_f16_test7(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 }
 
 define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test8:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test8:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test8:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0x8000
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmul_select_f16_test8:
 ; GFX9:       ; %bb.0:
@@ -1563,22 +2264,22 @@ define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f16_test8:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
-; GFX1030-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test8:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f16_test8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
+; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f16_test8:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half -0.000000e+00, half 0.000000e+00
   %ldexp = fmul half %x, %y
@@ -1586,40 +2287,87 @@ define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 }
 
 define half @fmul_select_f16_test9(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test9:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e64 v0, -v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test9:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc
-; GFX9-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test9:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test9:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test9:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, -v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test9:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 5, v1
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test9:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc
+; GFX9-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test9:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 5, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test9:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc_lo
+; GFX10-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test9:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 5, v1
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test9:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test9:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v1, 5, v1
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half -1.600000e+01, half -3.200000e+01
   %ldexp = fmul half %x, %y
@@ -1627,47 +2375,82 @@ define half @fmul_select_f16_test9(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 }
 
 define half @fmul_select_f16_test10_sel_log2val_neg11_pos11(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc
-; GFX9-NEXT:    s_movk_i32 s4, 0x8000
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX9-NEXT:    v_med3_i32 v1, v1, s4, v2
-; GFX9-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    s_movk_i32 s4, 0x8000
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc_lo
-; GFX1030-NEXT:    v_med3_i32 v1, v1, s4, 0x7fff
-; GFX1030-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_movk_i32 s0, 0x8000
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
-; GFX1100-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX9-SDAG-NEXT:    v_med3_i32 v1, v1, s4, v2
+; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    s_movk_i32 s4, 0x8000
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc_lo
+; GFX10-SDAG-NEXT:    v_med3_i32 v1, v1, s4, 0x7fff
+; GFX10-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
+; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half 0xH1000, half 0xH6800
   %ldexp = fmul half %x, %y
@@ -1675,47 +2458,82 @@ define half @fmul_select_f16_test10_sel_log2val_neg11_pos11(half %x, i32 %bool.a
 }
 
 define half @fmul_select_f16_test11_sel_log2val_pos7_neg14(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc
-; GFX9-NEXT:    s_movk_i32 s4, 0x8000
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX9-NEXT:    v_med3_i32 v1, v1, s4, v2
-; GFX9-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    s_movk_i32 s4, 0x8000
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc_lo
-; GFX1030-NEXT:    v_med3_i32 v1, v1, s4, 0x7fff
-; GFX1030-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_movk_i32 s0, 0x8000
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
-; GFX1100-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX9-SDAG-NEXT:    v_med3_i32 v1, v1, s4, v2
+; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    s_movk_i32 s4, 0x8000
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc_lo
+; GFX10-SDAG-NEXT:    v_med3_i32 v1, v1, s4, 0x7fff
+; GFX10-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
+; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half 0xH5800, half 0xH0400
   %ldexp = fmul half %x, %y
@@ -1723,72 +2541,114 @@ define half @fmul_select_f16_test11_sel_log2val_pos7_neg14(half %x, i32 %bool.ar
 }
 
 define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test1:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3f80
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test1:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x4000
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test1:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test1:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test1:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test1:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3f80
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test1:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test1:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4000
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test1:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test1:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 2.000000e+00, bfloat 1.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -1796,72 +2656,114 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test2:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3f80
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3f00
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test2:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x3f00
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test2:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v3, 0x3f00 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test2:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test2:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test2:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3f80
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3f00
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test2:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test2:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3f00
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test2:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test2:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0x3f00 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test2:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 5.000000e-01, bfloat 1.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -1869,111 +2771,158 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2bf16_test3:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2bf16_test3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3f80
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0x4000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2bf16_test3:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x4000
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX1030-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX1030-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX1030-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX1030-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX1030-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX1030-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX1030-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX1030-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2bf16_test3:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v5, 0x4000
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX1100-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX1100-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX1100-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX1100-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1100-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2bf16_test3:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2bf16_test3:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2bf16_test3:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3f80
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x4000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-SDAG-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-SDAG-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-SDAG-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2bf16_test3:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_v2bf16_test3:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0x4000
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-SDAG-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_v2bf16_test3:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_v2bf16_test3:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v5, 0x4000
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_v2bf16_test3:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x bfloat> <bfloat 2.000000e+00, bfloat 2.000000e+00>, <2 x bfloat> <bfloat 1.000000e+00, bfloat 1.000000e+00>
   %ldexp = fmul <2 x bfloat> %x, %y
@@ -1981,111 +2930,158 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a
 }
 
 define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2bf16_test4:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2bf16_test4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3f80
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3f00
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2bf16_test4:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x3f00
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX1030-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX1030-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX1030-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX1030-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX1030-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX1030-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX1030-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX1030-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2bf16_test4:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v5, 0x3f00
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX1100-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX1100-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX1100-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX1100-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1100-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2bf16_test4:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2bf16_test4:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2bf16_test4:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3f80
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x3f00
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-SDAG-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-SDAG-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-SDAG-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2bf16_test4:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_v2bf16_test4:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3f00
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-SDAG-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_v2bf16_test4:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_v2bf16_test4:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3f00
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_v2bf16_test4:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x bfloat> <bfloat 5.000000e-01, bfloat 5.000000e-01>, <2 x bfloat> <bfloat 1.000000e+00, bfloat 1.000000e+00>
   %ldexp = fmul <2 x bfloat> %x, %y
@@ -2093,73 +3089,108 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a
 }
 
 define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test5:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0x41000000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, v3, 2.0, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4100
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test5:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x4000
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test5:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test5:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, v3, 2.0, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test5:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test5:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4100
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test5:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test5:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4000
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test5:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test5:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test5:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 2.000000e+00, bfloat 8.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -2167,74 +3198,116 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test6:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0x40400000
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0xc1000000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4040
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffc100
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test6:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0xffffc100
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test6:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v3, 0xffffc100 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test6:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0x40400000
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v4, 0xc1000000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test6:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc100
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4040
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test6:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4040
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffc100
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test6:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc100
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4040
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test6:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xffffc100
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test6:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4040
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xc100, vcc_lo
+; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test6:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0xffffc100 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test6:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4040
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xc100, vcc_lo
+; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat -8.000000e+00, bfloat 3.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -2242,73 +3315,115 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test7:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0x41000000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, -4.0, v3, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffffc080
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4100
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test7:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x4100
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test7:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v3, 0x4100 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test7:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, -4.0, v3, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test7:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4100
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc080
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test7:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xffffc080
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4100
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test7:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4100
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc080
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test7:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4100
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test7:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc080
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x4100, vcc_lo
+; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test7:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0x4100 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test7:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc080
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x4100, vcc_lo
+; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 8.000000e+00, bfloat -4.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -2316,73 +3431,111 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test8:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test8:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, 15
-; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test8:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX1030-NEXT:    v_lshlrev_b16 v1, 15, v1
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test8:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_lshlrev_b16 v1, 15, v1
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test8:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test8:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0x8000
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test8:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 15
+; GFX9-SDAG-NEXT:    v_lshlrev_b16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test8:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x8000
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test8:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshlrev_b16 v1, 15, v1
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test8:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
+; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test8:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 15, v1
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test8:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat -0.000000e+00, bfloat 0.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -2390,74 +3543,121 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test9:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0xc2000000
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0xc1800000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test9:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffffc200
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffc180
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test9:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0xffffc180
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test9:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v3, 0xffffc180 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test9:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc2000000
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v4, 0xc1800000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test9:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 5, v1
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test9:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xffffc200
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffc180
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test9:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 5, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test9:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xffffc180
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test9:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 5, v1
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test9:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0xffffc180 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test9:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v1, 5, v1
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat -1.600000e+01, bfloat -3.200000e+01
   %ldexp = fmul bfloat %x, %y
@@ -2465,74 +3665,111 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0xdb800000
-; GFX7-NEXT:    v_bfrev_b32_e32 v4, 7
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffffdb80
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffe000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0xffffe000
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v3, 0xffffe000 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0xdb800000
+; GFX7-SDAG-NEXT:    v_bfrev_b32_e32 v4, 7
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, 56, v3, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xffffdb80
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffe000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 56, v3, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xffffe000
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 56, 0x41, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0xffffe000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 56, 0x41, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 0xRE000, bfloat 0xRDB80
   %ldexp = fmul bfloat %x, %y
@@ -2540,74 +3777,111 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b
 }
 
 define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_bfrev_b32_e32 v3, 50
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0x34800000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4c00
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3480
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x3480
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v3, 0x3480 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_bfrev_b32_e32 v3, 50
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v4, 0x34800000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_not_b32_e32 v3, 21
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, 25, v3, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4c00
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3480
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_not_b32_e32 v3, 21
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 25, v3, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3480
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 25, 0xffffffea, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0x3480 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 25, 0xffffffea, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 0xR3480, bfloat 0xR4C00
   %ldexp = fmul bfloat %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir b/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir
new file mode 100644
index 0000000..5c7c076
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir
@@ -0,0 +1,54 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=greedy,2 -stress-regalloc=4 -stop-after=virtregrewriter,2 -regalloc-enable-priority-advisor=default -o - %s | FileCheck -check-prefixes=CHECK,DEFAULT %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=greedy,2 -stress-regalloc=4 -stop-after=virtregrewriter,2 -regalloc-enable-priority-advisor=dummy -o - %s | FileCheck -check-prefixes=CHECK,DUMMY %s
+
+# Check that the regalloc-enable-priority-advisor=dummy option works
+# and the result is different from the default. Ordinarily %1 would be
+# prioritized higher than %0 due to the register class priority
+
+---
+name:            foo
+tracksRegLiveness: true
+machineFunctionInfo:
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  frameOffsetReg:  '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vreg_128 }
+  - { id: 2, class: vgpr_32 }
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; DEFAULT-LABEL: name: foo
+    ; DEFAULT: liveins: $vgpr0, $vgpr1
+    ; DEFAULT-NEXT: {{  $}}
+    ; DEFAULT-NEXT: SI_SPILL_V128_SAVE $vgpr1_vgpr2_vgpr3_vgpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5)
+    ; DEFAULT-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+    ; DEFAULT-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+    ; DEFAULT-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+    ; DEFAULT-NEXT: renamable $vgpr3 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+    ; DEFAULT-NEXT: renamable $vgpr3 = V_ADD_U32_e32 killed $vgpr2, killed $vgpr3, implicit $exec
+    ; DEFAULT-NEXT: SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1
+    ;
+    ; DUMMY-LABEL: name: foo
+    ; DUMMY: liveins: $vgpr0, $vgpr1
+    ; DUMMY-NEXT: {{  $}}
+    ; DUMMY-NEXT: SI_SPILL_V128_SAVE $vgpr1_vgpr2_vgpr3_vgpr4, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
+    ; DUMMY-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+    ; DUMMY-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+    ; DUMMY-NEXT: renamable $vgpr2 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
+    ; DUMMY-NEXT: renamable $vgpr3_vgpr4_vgpr5_vgpr6 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
+    ; DUMMY-NEXT: renamable $vgpr3 = V_ADD_U32_e32 killed $vgpr3, killed $vgpr2, implicit $exec
+    ; DUMMY-NEXT: SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1
+    undef %1.sub0:vreg_128 = COPY $vgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+    %2:vgpr_32 = V_ADD_U32_e32 %1.sub0, %0, implicit $exec
+    $vgpr3 = COPY %2
+    SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1
+
+...
+
+# CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index 73aa87e..9acb3a4 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -1,64 +1,829 @@
-; RUN: not llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=+promote-alloca -verify-machineinstrs < %s 2>&1 | FileCheck %s
-; RUN: not llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=-promote-alloca -verify-machineinstrs < %s 2>&1 | FileCheck %s
-; RUN: not llc -mtriple=r600-- -mcpu=cypress < %s 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-GISEL %s
 target datalayout = "A5"
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s5
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s5, s5, 15
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, -16
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s5, 6
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-SDAG-NEXT:    s_mov_b32 s32, 16
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-GISEL-NEXT:    s_mov_b32 s32, 16
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, s32
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s0 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s1, s1, 15
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, -16
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s1, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s0, s1
+; GFX11-GISEL-NEXT:    s_endpgm
   %alloca = alloca i32, i32 %n, addrspace(5)
   store volatile i32 123, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x2000
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    s_add_i32 s5, s32, 0x1fff
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s5, s5, 0xffffe000
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 10
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x2000
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0x1fff
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s4, s4, 15
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xffffe000
+; GFX9-GISEL-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 10
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-SDAG-NEXT:    s_movk_i32 s32, 0x80
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 10
+; GFX11-SDAG-NEXT:    s_add_i32 s1, s32, 0xfff
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-GISEL-NEXT:    s_movk_i32 s32, 0x80
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 10
+; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0xfff
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s0, s0, 15
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    s_endpgm
   %alloca = alloca i32, i32 %n, align 128, addrspace(5)
   store volatile i32 10, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_under_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 22
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s5
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform_under_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s5, s5, 15
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, -16
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 22
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s5, 6
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_under_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 22
+; GFX11-SDAG-NEXT:    s_mov_b32 s32, 16
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform_under_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 22
+; GFX11-GISEL-NEXT:    s_mov_b32 s32, 16
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, s32
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s0 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s1, s1, 15
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, -16
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s1, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s0, s1
+; GFX11-GISEL-NEXT:    s_endpgm
   %alloca = alloca i32, i32 %n, align 2, addrspace(5)
   store volatile i32 22, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s32, 16
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s32, 16
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca float, i32 %idx, addrspace(5)
   store volatile i32 123, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x2000
+; GFX9-SDAG-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s32, 0x1fff
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, 0xffffe000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x2000
+; GFX9-GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0x1fff
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xffffe000
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s6, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1bc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_movk_i32 s32, 0x80
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s32, 0xfff
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, 0xfffff000
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s1, s1, s4
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s0 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_movk_i32 s32, 0x80
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1bc
+; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0xfff
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca i32, i32 %idx, align 128, addrspace(5)
   store volatile i32 444, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligned() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 4, 15
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s32, 16
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX11-SDAG-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x29a
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s32, 16
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 4, 15
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca i128, i32 %idx, align 2, addrspace(5)
   store volatile i32 666, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %n, i32 %m) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x2000
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX9-SDAG-NEXT:  ; %bb.1: ; %bb.0
+; GFX9-SDAG-NEXT:    s_lshl_b32 s5, s5, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s5, s5, 15
+; GFX9-SDAG-NEXT:    s_add_i32 s6, s32, 0xfff
+; GFX9-SDAG-NEXT:    s_and_b32 s5, s5, -16
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s9, s6, 0xfffff000
+; GFX9-SDAG-NEXT:    s_lshl_b32 s5, s5, 6
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s9, s5
+; GFX9-SDAG-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s5, s[6:7]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s10, v0, s5
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s5
+; GFX9-SDAG-NEXT:    s_max_u32 s8, s8, s10
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX9-SDAG-NEXT:  ; %bb.3:
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s8, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 3
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s5
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:  .LBB6_4: ; %bb.1
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s33
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s5
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x2000
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX9-GISEL-NEXT:  ; %bb.1: ; %bb.0
+; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s5, s5, 15
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, -16
+; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s5, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0xfff
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xfffff000
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s6
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s10, v0, s9
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s9
+; GFX9-GISEL-NEXT:    s_max_u32 s8, s8, s10
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX9-GISEL-NEXT:  ; %bb.3:
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    s_lshl_b32 s7, s8, 6
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s7
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:  .LBB6_4: ; %bb.1
+; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s4, s4, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s5, s32
+; GFX9-GISEL-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], s33
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_movk_i32 s32, 0x80
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX11-SDAG-NEXT:  ; %bb.1: ; %bb.0
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX11-SDAG-NEXT:    s_add_i32 s3, s32, 0x7ff
+; GFX11-SDAG-NEXT:    s_add_i32 s1, s1, 15
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_and_b32 s4, s1, -16
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_and_b32 s1, s3, 0xfffff800
+; GFX11-SDAG-NEXT:    s_lshl_b32 s3, s4, 5
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-SDAG-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s3, s4
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s2, s2, s5
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX11-SDAG-NEXT:  ; %bb.3:
+; GFX11-SDAG-NEXT:    s_mov_b32 s3, s32
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 4
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s2, 5, s3
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s3 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:  .LBB6_4: ; %bb.1
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s33 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_movk_i32 s32, 0x80
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX11-GISEL-NEXT:  ; %bb.1: ; %bb.0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s1, s1, 15
+; GFX11-GISEL-NEXT:    s_add_u32 s3, s32, 0x7ff
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, -16
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_lshl_b32 s4, s1, 5
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s3, 0xfffff800
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s4
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s2, s2, s5
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX11-GISEL-NEXT:  ; %bb.3:
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s2, s2, 5
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s3 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s3, s2
+; GFX11-GISEL-NEXT:  .LBB6_4: ; %bb.1
+; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s0, s0, 15
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-GISEL-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s33 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    s_endpgm
 entry:
   %cond = icmp eq i32 %n, 0
   %alloca1 = alloca i32, i32 8, addrspace(5)
@@ -77,10 +842,206 @@ bb.1:
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i32 %m) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_control_flow:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x1000
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc0 .LBB7_6
+; GFX9-SDAG-NEXT:  ; %bb.1: ; %bb.1
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s8, s[6:7]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s9, v0, s8
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s8
+; GFX9-SDAG-NEXT:    s_max_u32 s4, s4, s9
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX9-SDAG-NEXT:  ; %bb.3:
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s4, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s6
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_cbranch_execnz .LBB7_5
+; GFX9-SDAG-NEXT:  .LBB7_4: ; %bb.0
+; GFX9-SDAG-NEXT:    s_lshl_b32 s5, s5, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s32, 0xfff
+; GFX9-SDAG-NEXT:    s_add_i32 s5, s5, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, 0xfffff000
+; GFX9-SDAG-NEXT:    s_and_b32 s5, s5, -16
+; GFX9-SDAG-NEXT:    s_lshl_b32 s5, s5, 6
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s4, s5
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:  .LBB7_5: ; %bb.2
+; GFX9-SDAG-NEXT:    s_endpgm
+; GFX9-SDAG-NEXT:  .LBB7_6:
+; GFX9-SDAG-NEXT:    s_branch .LBB7_4
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_control_flow:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, 1
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x1000
+; GFX9-GISEL-NEXT:    s_cbranch_scc0 .LBB7_4
+; GFX9-GISEL-NEXT:  ; %bb.1: ; %bb.1
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s4, s[6:7]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s9, v0, s4
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s4
+; GFX9-GISEL-NEXT:    s_max_u32 s8, s8, s9
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX9-GISEL-NEXT:  ; %bb.3:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s8, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s6
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9-GISEL-NEXT:  .LBB7_4: ; %Flow
+; GFX9-GISEL-NEXT:    s_xor_b32 s4, s4, 1
+; GFX9-GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB7_6
+; GFX9-GISEL-NEXT:  ; %bb.5: ; %bb.0
+; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s4, s5, 15
+; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0xfff
+; GFX9-GISEL-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xfffff000
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:  .LBB7_6: ; %bb.2
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_control_flow:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s32, 64
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc0 .LBB7_6
+; GFX11-SDAG-NEXT:  ; %bb.1: ; %bb.1
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s4
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX11-SDAG-NEXT:  ; %bb.3:
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 1
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s2
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s2 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_cbranch_execnz .LBB7_5
+; GFX11-SDAG-NEXT:  .LBB7_4: ; %bb.0
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s1, 2
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT:    s_add_i32 s1, s32, 0x7ff
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT:    s_and_b32 s1, s1, 0xfffff800
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:  .LBB7_5: ; %bb.2
+; GFX11-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-NEXT:  .LBB7_6:
+; GFX11-SDAG-NEXT:    s_branch .LBB7_4
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_control_flow:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s32, 64
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 1
+; GFX11-GISEL-NEXT:    s_cbranch_scc0 .LBB7_4
+; GFX11-GISEL-NEXT:  ; %bb.1: ; %bb.1
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s3, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s0, s3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s2, s2, s4
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX11-GISEL-NEXT:  ; %bb.3:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 1
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s2, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s3, s0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s3 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:  .LBB7_4: ; %Flow
+; GFX11-GISEL-NEXT:    s_xor_b32 s0, s0, 1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_and_b32 s0, s0, 1
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB7_6
+; GFX11-GISEL-NEXT:  ; %bb.5: ; %bb.0
+; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s0, s1, 15
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 2
+; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0x7ff
+; GFX11-GISEL-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff800
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:  .LBB7_6: ; %bb.2
+; GFX11-GISEL-NEXT:    s_endpgm
 entry:
   %cond = icmp eq i32 %n, 0
   br i1 %cond, label %bb.0, label %bb.1
@@ -97,62 +1058,1113 @@ bb.2:
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, i32 %n, addrspace(5)
   store volatile i32 123, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    s_add_i32 s33, s32, 0x1fc0
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x4000
+; GFX9-SDAG-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB9_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s32, 0x1fff
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, 0xffffe000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 10
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xc000
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    s_add_i32 s33, s32, 0x1fc0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x4000
+; GFX9-GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB9_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0x1fff
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s6, 6
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xffffe000
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 10
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xc000
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_add_i32 s33, s32, 0x7f
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffff80
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0x100
+; GFX11-SDAG-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB9_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_add_i32 s1, s32, 0xfff
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 10
+; GFX11-SDAG-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xff00
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_add_i32 s33, s32, 0x7f
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_and_b32 s33, s33, 0xffffff80
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0x100
+; GFX11-GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB9_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0xfff
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 10
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xff00
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, i32 %n, align 128, addrspace(5)
   store volatile i32 10, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 22
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 22
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 22
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 22
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, i32 %n, align 2, addrspace(5)
   store volatile i32 22, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_divergent() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB11_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB11_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB11_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB11_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca i32, i32 %idx, addrspace(5)
   store volatile i32 123, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s33
+; GFX9-SDAG-NEXT:    s_add_i32 s33, s32, 0x1fc0
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x4000
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s32, 0x1fff
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s6, s4, 0xffffe000
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0
+; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX9-SDAG-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s8, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s9, v0, s8
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s8
+; GFX9-SDAG-NEXT:    s_max_u32 s7, s7, s9
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB12_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s7, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xc000
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s10
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    s_add_i32 s33, s32, 0x1fc0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x4000
+; GFX9-GISEL-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB12_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0x1fff
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s6, 6
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xffffe000
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1bc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xc000
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-SDAG-NEXT:    s_mov_b32 s5, s33
+; GFX11-SDAG-NEXT:    s_add_i32 s33, s32, 0x7f
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0x100
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s32, 0xfff
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, 0xfffff000
+; GFX11-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffff80
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s1, s1, s4
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB12_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s0 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xff00
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_add_i32 s33, s32, 0x7f
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_and_b32 s33, s33, 0xffffff80
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0x100
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB12_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0xfff
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1bc
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xff00
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca i32, i32 %idx, align 128, addrspace(5)
   store volatile i32 444, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB13_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB13_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB13_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB13_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB13_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB13_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x29a
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB13_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB13_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca i32, i32 %idx, align 2, addrspace(5)
   store volatile i32 666, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s13, s33
+; GFX9-SDAG-NEXT:    s_add_i32 s33, s32, 0xfc0
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, 0
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x3000
+; GFX9-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-SDAG-NEXT:    s_cbranch_execz .LBB14_6
+; GFX9-SDAG-NEXT:  ; %bb.1: ; %bb.0
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, 0
+; GFX9-SDAG-NEXT:  .LBB14_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s9, s[6:7]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s11, v1, s9
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s9
+; GFX9-SDAG-NEXT:    s_max_u32 s10, s10, s11
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB14_2
+; GFX9-SDAG-NEXT:  ; %bb.3:
+; GFX9-SDAG-NEXT:    s_add_i32 s6, s32, 0xfff
+; GFX9-SDAG-NEXT:    s_and_b32 s9, s6, 0xfffff000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s10, 6, v1
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, 0x3ff, v31
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, 0x1ff0, v1
+; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, 0
+; GFX9-SDAG-NEXT:  .LBB14_4: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s11, s[6:7]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s12, v1, s11
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s11
+; GFX9-SDAG-NEXT:    s_max_u32 s10, s10, s12
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB14_4
+; GFX9-SDAG-NEXT:  ; %bb.5:
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s10, 6, v1
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 3
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 4
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, off, s[0:3], s6
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:  .LBB14_6: ; %bb.1
+; GFX9-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 2
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:  .LBB14_7: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s6, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s7, v0, s6
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s6
+; GFX9-SDAG-NEXT:    s_max_u32 s8, s8, s7
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB14_7
+; GFX9-SDAG-NEXT:  ; %bb.8:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s8, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s33
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xd000
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s13
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_mov_b32 s13, s33
+; GFX9-GISEL-NEXT:    s_add_i32 s33, s32, 0xfc0
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-GISEL-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x3000
+; GFX9-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-GISEL-NEXT:    s_cbranch_execz .LBB14_6
+; GFX9-GISEL-NEXT:  ; %bb.1: ; %bb.0
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, 0
+; GFX9-GISEL-NEXT:  .LBB14_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s11, v1, s10
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX9-GISEL-NEXT:    s_max_u32 s9, s9, s11
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB14_2
+; GFX9-GISEL-NEXT:  ; %bb.3:
+; GFX9-GISEL-NEXT:    s_add_u32 s7, s32, 0xfff
+; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s9, 6
+; GFX9-GISEL-NEXT:    s_and_b32 s9, s7, 0xfffff000
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v1, v2, 2, 15
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s9, s6
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s10, 0
+; GFX9-GISEL-NEXT:  .LBB14_4: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s11, s[6:7]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s12, v1, s11
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s11
+; GFX9-GISEL-NEXT:    s_max_u32 s10, s10, s12
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB14_4
+; GFX9-GISEL-NEXT:  ; %bb.5:
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-GISEL-NEXT:    s_lshl_b32 s7, s10, 6
+; GFX9-GISEL-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s7
+; GFX9-GISEL-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:  .LBB14_6: ; %bb.1
+; GFX9-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:  .LBB14_7: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s6, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s7, v0, s6
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s6
+; GFX9-GISEL-NEXT:    s_max_u32 s8, s8, s7
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB14_7
+; GFX9-GISEL-NEXT:  ; %bb.8:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s8, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], s33
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xd000
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s13
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_mov_b32 s7, s33
+; GFX11-SDAG-NEXT:    s_add_i32 s33, s32, 63
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_and_not1_b32 s33, s33, 63
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xc0
+; GFX11-SDAG-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-SDAG-NEXT:    s_cbranch_execz .LBB14_6
+; GFX11-SDAG-NEXT:  ; %bb.1: ; %bb.0
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s3, 0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX11-SDAG-NEXT:  .LBB14_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s4, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s5, v1, s4
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s4
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s3, s3, s5
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB14_2
+; GFX11-SDAG-NEXT:  ; %bb.3:
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0x3ff, v31
+; GFX11-SDAG-NEXT:    s_add_i32 s2, s32, 0x7ff
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, exec_lo
+; GFX11-SDAG-NEXT:    s_and_b32 s2, s2, 0xfffff800
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v2, s3, 5, s2
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s3, 0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0x1ff0, v1
+; GFX11-SDAG-NEXT:  .LBB14_4: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s5, s4
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s6, v1, s5
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s4, s5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s3, s3, s6
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB14_4
+; GFX11-SDAG-NEXT:  ; %bb.5:
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, s3, 5, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s2 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v3, s4 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT:  .LBB14_6: ; %bb.1
+; GFX11-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, v0, 2, 15
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX11-SDAG-NEXT:  .LBB14_7: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v1, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB14_7
+; GFX11-SDAG-NEXT:  ; %bb.8:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 1
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, s0, 5, s1
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s33 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s7
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xff40
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_mov_b32 s7, s33
+; GFX11-GISEL-NEXT:    s_add_i32 s33, s32, 63
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_and_not1_b32 s33, s33, 63
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xc0
+; GFX11-GISEL-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-GISEL-NEXT:    s_cbranch_execz .LBB14_6
+; GFX11-GISEL-NEXT:  ; %bb.1: ; %bb.0
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v2, v1, 2, 15
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0x3ff, v31
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, -16, v2
+; GFX11-GISEL-NEXT:  .LBB14_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s5, v2, s4
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s2, s2, s5
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB14_2
+; GFX11-GISEL-NEXT:  ; %bb.3:
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-GISEL-NEXT:    s_lshl_b32 s5, s2, 5
+; GFX11-GISEL-NEXT:    s_add_u32 s2, s32, 0x7ff
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, exec_lo
+; GFX11-GISEL-NEXT:    s_and_b32 s2, s2, 0xfffff800
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, 0
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s2, s5
+; GFX11-GISEL-NEXT:  .LBB14_4: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s5, s4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s6, v1, s5
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s4, s5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s3, s3, s6
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB14_4
+; GFX11-GISEL-NEXT:  ; %bb.5:
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 4
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s3, s3, 5
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s2 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v2, s4 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s4, s3
+; GFX11-GISEL-NEXT:  .LBB14_6: ; %bb.1
+; GFX11-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB14_7: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB14_7
+; GFX11-GISEL-NEXT:  ; %bb.8:
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s33 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s7
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xff40
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %cond = icmp eq i32 %n, 0
   %alloca1 = alloca i32, i32 8, addrspace(5)
@@ -171,10 +2183,272 @@ bb.1:
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s33
+; GFX9-SDAG-NEXT:    s_add_i32 s33, s32, 0xfc0
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, 0
+; GFX9-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x2000
+; GFX9-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_cbranch_execz .LBB15_4
+; GFX9-SDAG-NEXT:  ; %bb.1: ; %bb.1
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT:  .LBB15_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s9, s[6:7]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s10, v1, s9
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s9
+; GFX9-SDAG-NEXT:    s_max_u32 s8, s8, s10
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB15_2
+; GFX9-SDAG-NEXT:  ; %bb.3:
+; GFX9-SDAG-NEXT:    s_add_i32 s6, s32, 0xfff
+; GFX9-SDAG-NEXT:    s_and_b32 s6, s6, 0xfffff000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v2, s8, 6, v1
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v2
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    ; implicit-def: $vgpr31
+; GFX9-SDAG-NEXT:  .LBB15_4: ; %Flow
+; GFX9-SDAG-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-SDAG-NEXT:    s_cbranch_execz .LBB15_8
+; GFX9-SDAG-NEXT:  ; %bb.5: ; %bb.0
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, 0
+; GFX9-SDAG-NEXT:  .LBB15_6: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s9, s[6:7]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s10, v0, s9
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s9
+; GFX9-SDAG-NEXT:    s_max_u32 s8, s8, s10
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB15_6
+; GFX9-SDAG-NEXT:  ; %bb.7:
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s8, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s6
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:  .LBB15_8: ; %bb.2
+; GFX9-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xe000
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s11
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_mov_b32 s11, s33
+; GFX9-GISEL-NEXT:    s_add_i32 s33, s32, 0xfc0
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
+; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-GISEL-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x2000
+; GFX9-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_cbranch_execz .LBB15_4
+; GFX9-GISEL-NEXT:  ; %bb.1: ; %bb.1
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v1, 2, 15
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT:  .LBB15_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s10, v0, s9
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s9
+; GFX9-GISEL-NEXT:    s_max_u32 s8, s8, s10
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB15_2
+; GFX9-GISEL-NEXT:  ; %bb.3:
+; GFX9-GISEL-NEXT:    s_add_u32 s7, s32, 0xfff
+; GFX9-GISEL-NEXT:    s_and_b32 s7, s7, 0xfffff000
+; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s8, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s7, s6
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    ; implicit-def: $vgpr31
+; GFX9-GISEL-NEXT:  .LBB15_4: ; %Flow
+; GFX9-GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-GISEL-NEXT:    s_cbranch_execz .LBB15_8
+; GFX9-GISEL-NEXT:  ; %bb.5: ; %bb.0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
+; GFX9-GISEL-NEXT:  .LBB15_6: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s10, v0, s9
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s9
+; GFX9-GISEL-NEXT:    s_max_u32 s8, s8, s10
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB15_6
+; GFX9-GISEL-NEXT:  ; %bb.7:
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s7, s8, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s7
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:  .LBB15_8: ; %bb.2
+; GFX9-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xe000
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s11
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_mov_b32 s5, s33
+; GFX11-SDAG-NEXT:    s_add_i32 s33, s32, 63
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-SDAG-NEXT:    s_and_not1_b32 s33, s33, 63
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0x80
+; GFX11-SDAG-NEXT:    v_cmpx_ne_u32_e32 0, v0
+; GFX11-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-SDAG-NEXT:    s_cbranch_execz .LBB15_4
+; GFX11-SDAG-NEXT:  ; %bb.1: ; %bb.1
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX11-SDAG-NEXT:  .LBB15_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v1, s3
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s1, s1, s4
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB15_2
+; GFX11-SDAG-NEXT:  ; %bb.3:
+; GFX11-SDAG-NEXT:    s_add_i32 s2, s32, 0x7ff
+; GFX11-SDAG-NEXT:    ; implicit-def: $vgpr31
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_and_b32 s2, s2, 0xfffff800
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, s1, 5, s2
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s2 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT:  .LBB15_4: ; %Flow
+; GFX11-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-SDAG-NEXT:    s_cbranch_execz .LBB15_8
+; GFX11-SDAG-NEXT:  ; %bb.5: ; %bb.0
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB15_6: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s1, s1, s4
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB15_6
+; GFX11-SDAG-NEXT:  ; %bb.7:
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 1
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s2
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s2 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:  .LBB15_8: ; %bb.2
+; GFX11-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xff80
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s5
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_mov_b32 s5, s33
+; GFX11-GISEL-NEXT:    s_add_i32 s33, s32, 63
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-GISEL-NEXT:    s_and_not1_b32 s33, s33, 63
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0x80
+; GFX11-GISEL-NEXT:    v_cmpx_ne_u32_e32 0, v0
+; GFX11-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-GISEL-NEXT:    s_cbranch_execz .LBB15_4
+; GFX11-GISEL-NEXT:  ; %bb.1: ; %bb.1
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v1, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB15_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s1, s1, s4
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB15_2
+; GFX11-GISEL-NEXT:  ; %bb.3:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 2
+; GFX11-GISEL-NEXT:    s_add_u32 s2, s32, 0x7ff
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s1, 5
+; GFX11-GISEL-NEXT:    s_and_b32 s2, s2, 0xfffff800
+; GFX11-GISEL-NEXT:    ; implicit-def: $vgpr31
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s2, s1
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s2 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:  .LBB15_4: ; %Flow
+; GFX11-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-GISEL-NEXT:    s_cbranch_execz .LBB15_8
+; GFX11-GISEL-NEXT:  ; %bb.5: ; %bb.0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB15_6: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s1, s1, s4
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB15_6
+; GFX11-GISEL-NEXT:  ; %bb.7:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 1
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s1, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s2, s1
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s2 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:  .LBB15_8: ; %bb.2
+; GFX11-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xff80
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s5
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %cond = icmp eq i32 %n, 0
   br i1 %cond, label %bb.0, label %bb.1
@@ -190,3 +2464,257 @@ bb.1:
 bb.2:
   ret void
 }
+
+define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x7fff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB16_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB16_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB16_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB16_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x7fff0, v0
+; GFX11-SDAG-NEXT:  .LBB16_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB16_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x29a
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB16_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB16_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %alloca = alloca i32, i16 %n, align 2, addrspace(5)
+  store volatile i32 666, ptr addrspace(5) %alloca
+  ret void
+}
+
+define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x29a
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %alloca = alloca i32, i64 %n, align 2, addrspace(5)
+  store volatile i32 666, ptr addrspace(5) %alloca
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
index ebfb5e9..a324ba3 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
@@ -1625,14 +1625,12 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
 ; CODEGEN-IEEE-GISEL:       ; %bb.0:
 ; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4b800000
 ; CODEGEN-IEEE-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 24, vcc
+; CODEGEN-IEEE-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; CODEGEN-IEEE-GISEL-NEXT:    v_rsq_f32_e32 v0, v0
-; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0x45800000
-; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 12, vcc
+; CODEGEN-IEEE-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 97d642b..5415af0 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -5249,6 +5249,114 @@ bb:
   ret void
 }
 
+define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) {
+; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-NEXT:    s_add_i32 s2, s2, s3
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, -16, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 15
+; GFX9-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX10:       ; %bb.0: ; %bb
+; GFX10-NEXT:    s_add_u32 s0, s0, s5
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-NEXT:    v_add3_u32 v0, s2, s3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 15
+; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:-16
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    v_add3_u32 v0, s0, s1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v1, 15
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:-16 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_mov_b32_e32 v1, 15
+; GFX12-NEXT:    s_add_co_i32 s0, s0, s1
+; GFX12-NEXT:    scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    s_endpgm
+;
+; GFX9-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX9-PAL:       ; %bb.0: ; %bb
+; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-PAL-NEXT:    s_mov_b32 s2, s8
+; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
+; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
+; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-PAL-NEXT:    s_add_i32 s0, s0, s1
+; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX9-PAL-NEXT:    v_add_u32_e32 v0, -16, v0
+; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_endpgm
+;
+; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX940:       ; %bb.0: ; %bb
+; GFX940-NEXT:    s_add_i32 s0, s0, s1
+; GFX940-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-NEXT:    v_add_u32_e32 v0, -16, v0
+; GFX940-NEXT:    v_mov_b32_e32 v1, 15
+; GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_endpgm
+;
+; GFX10-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX10-PAL:       ; %bb.0: ; %bb
+; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX10-PAL-NEXT:    s_mov_b32 s2, s8
+; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s5
+; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
+; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-PAL-NEXT:    v_add3_u32 v0, s0, s1, v0
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
+; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:-16
+; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT:    s_endpgm
+;
+; GFX11-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX11-PAL:       ; %bb.0: ; %bb
+; GFX11-PAL-NEXT:    v_add3_u32 v0, s0, s1, v0
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
+; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:-16 dlc
+; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-PAL-NEXT:    s_endpgm
+;
+; GFX12-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    v_mov_b32_e32 v1, 15
+; GFX12-PAL-NEXT:    s_add_co_i32 s0, s0, s1
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    s_endpgm
+bb:
+  %add1 = add nsw i32 %sidx, %vidx
+  %add2 = add nsw i32 %add1, -16
+  %gep = getelementptr inbounds [16 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2
+  store volatile i32 15, ptr addrspace(5) %gep, align 4
+  ret void
+}
+
 define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %scevgep) {
 ; GFX9-LABEL: sgpr_base_negative_offset:
 ; GFX9:       ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index 005e401..822d40f7 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -5,6 +5,8 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL
 
 declare half @llvm.fma.f16(half, half, half)
 declare half @llvm.maxnum.f16(half, half)
@@ -27,6 +29,16 @@ define half @test_fma(half %x, half %y, half %z) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_fma:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %r = call half @llvm.fma.f16(half %x, half %y, half %z)
   ret half %r
 }
@@ -50,6 +62,16 @@ define half @test_fmac(half %x, half %y, half %z) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fmac_f16_e32 v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_fmac:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fmac_f16_e32 v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %r = call half @llvm.fma.f16(half %y, half %z, half %x)
   ret half %r
 }
@@ -81,6 +103,16 @@ define half @test_fmaak(half %x, half %y, half %z) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fmaak_f16 v0, v0, v1, 0x4200
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_fmaak:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fmaak_f16 v0, v0, v1, 0x4200
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %r = call half @llvm.fma.f16(half %x, half %y, half 0xH4200)
   ret half %r
 }
@@ -112,6 +144,16 @@ define half @test_fmamk(half %x, half %y, half %z) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fmamk_f16 v0, v0, 0x4200, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_fmamk:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fmamk_f16 v0, v0, 0x4200, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %r = call half @llvm.fma.f16(half %x, half 0xH4200, half %z)
   ret half %r
 }
@@ -193,6 +235,42 @@ define i32 @test_D139469_f16(half %arg) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: test_D139469_f16:
+; GFX12-SDAG:       ; %bb.0: ; %bb
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0x211e
+; GFX12-SDAG-NEXT:    v_mul_f16_e32 v2, 0x291e, v0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_fmac_f16_e32 v1, 0x291e, v0
+; GFX12-SDAG-NEXT:    v_min_num_f16_e32 v0, v2, v1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX12-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: test_D139469_f16:
+; GFX12-GISEL:       ; %bb.0: ; %bb
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0x211e
+; GFX12-GISEL-NEXT:    v_mul_f16_e32 v2, 0x291e, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_fmac_f16_e32 v1, 0x291e, v0
+; GFX12-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_cmp_gt_f16_e64 s0, 0, v1
+; GFX12-GISEL-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = fmul contract half %arg, 0xH291E
   %i1 = fcmp olt half %i, 0xH0000
@@ -306,6 +384,55 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: test_D139469_v2f16:
+; GFX12-SDAG:       ; %bb.0: ; %bb
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    s_movk_i32 s0, 0x211e
+; GFX12-SDAG-NEXT:    v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_pk_min_num_f16 v0, v1, v0
+; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX12-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-SDAG-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v1
+; GFX12-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: test_D139469_v2f16:
+; GFX12-GISEL:       ; %bb.0: ; %bb
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0x211e211e
+; GFX12-GISEL-NEXT:    v_pk_mul_f16 v2, 0x291e291e, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_pk_fma_f16 v0, 0x291e291e, v0, v1
+; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-GISEL-NEXT:    v_cmp_gt_f16_e64 s0, 0, v0
+; GFX12-GISEL-NEXT:    v_cmp_gt_f16_e64 s1, 0, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT:    v_cmp_gt_f16_e64 s2, 0, v3
+; GFX12-GISEL-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_or_b32 s0, s1, s2
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = fmul contract <2 x half> %arg, <half 0xH291E, half 0xH291E>
   %i1 = fcmp olt <2 x half> %i, <half 0xH0000, half 0xH0000>
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index 4b3f0db..fbcdbed 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
 
 define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
 ; SI-LABEL: test_fmax3_olt_0_f32:
@@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_0_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f32 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile  float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f32 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_1_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f32 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_0_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f16 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f16 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_1_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f16 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -594,6 +715,19 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; GFX11-NEXT:    v_pk_max_f16 v0, v2, v0
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_fmax3_v2f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v2, v0
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 38b712e..269fd52 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
 
 define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
 ; SI-LABEL: test_fmin3_olt_0_f32:
@@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f32 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f32 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f32 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f16 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f16 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f16 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -594,6 +715,19 @@ define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; GFX11-NEXT:    v_pk_min_f16 v0, v2, v0
 ; GFX11-NEXT:    v_pk_min_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_fmin3_v2f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v2, v0
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
   %min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
@@ -734,6 +868,39 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s6
+; GFX12-NEXT:    s_mov_b32 s13, s7
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile double, ptr addrspace(1) %aptr, align 4
   %b = load volatile double, ptr addrspace(1) %bptr, align 4
   %c = load volatile double, ptr addrspace(1) %cptr, align 4
@@ -877,6 +1044,39 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[2:3], v[0:1]
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s6
+; GFX12-NEXT:    s_mov_b32 s13, s7
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[2:3], v[0:1]
+; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile double, ptr addrspace(1) %aptr, align 4
   %b = load volatile double, ptr addrspace(1) %bptr, align 4
   %c = load volatile double, ptr addrspace(1) %cptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
index 104e157..9ae60f9 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
@@ -3307,489 +3307,459 @@ define amdgpu_ps i32 @s_mul_32_f16(half inreg %x, half inreg %y) {
 ; --------------------------------------------------------------------
 
 define float @v_mul_f32_select_64_1(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_64_1:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f32 v0, v1, v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f32_select_64_1:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f32_select_64_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_mul_f32_select_64_1:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f32 v0, v1, v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f32_select_64_1:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  %cond = icmp eq i32 %arg, 0
+  %select.pow2 = select i1 %cond, float 64.0, float 1.0
+  %mul = fmul float %x, %select.pow2
+  ret float %mul
+}
+
+define float @v_mul_f32_select_1_64(i32 %arg, float %x) {
+; GFX9-LABEL: v_mul_f32_select_1_64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: v_mul_f32_select_64_1:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f32_select_1_64:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  %cond = icmp eq i32 %arg, 0
+  %select.pow2 = select i1 %cond, float 1.0, float 64.0
+  %mul = fmul float %x, %select.pow2
+  ret float %mul
+}
+
+define float @v_mul_f32_select_n1_n64(i32 %arg, float %x) {
+; GFX9-LABEL: v_mul_f32_select_n1_n64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_f32_select_64_1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f32 v0, v1, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f32_select_n1_n64:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  %cond = icmp eq i32 %arg, 0
+  %select.pow2 = select i1 %cond, float -1.0, float -64.0
+  %mul = fmul float %x, %select.pow2
+  ret float %mul
+}
+
+define float @v_mul_f32_select_n64_n1(i32 %arg, float %x) {
+; GFX9-LABEL: v_mul_f32_select_n64_n1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f32_select_64_1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f32_select_n64_n1:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select i1 %cond, float 64.0, float 1.0
+  %select.pow2 = select i1 %cond, float -64.0, float -1.0
   %mul = fmul float %x, %select.pow2
   ret float %mul
 }
 
-define float @v_mul_f32_select_1_64(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_1_64:
+define float @v_mul_f32_select_128_64(i32 %arg, float %x) {
+; GFX9-SDAG-LABEL: v_mul_f32_select_128_64:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc
 ; GFX9-SDAG-NEXT:    v_ldexp_f32 v0, v1, v0
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_mul_f32_select_1_64:
+; GFX9-GISEL-LABEL: v_mul_f32_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 1.0, vcc
-; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_mul_f32_select_1_64:
+; GFX10-SDAG-LABEL: v_mul_f32_select_128_64:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
 ; GFX10-SDAG-NEXT:    v_ldexp_f32 v0, v1, v0
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: v_mul_f32_select_1_64:
+; GFX10-GISEL-LABEL: v_mul_f32_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_f32_select_1_64:
+; GFX11-SDAG-LABEL: v_mul_f32_select_128_64:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
 ; GFX11-SDAG-NEXT:    v_ldexp_f32 v0, v1, v0
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f32_select_1_64:
+; GFX11-GISEL-LABEL: v_mul_f32_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select i1 %cond, float 1.0, float 64.0
+  %select.pow2 = select i1 %cond, float 128.0, float 64.0
   %mul = fmul float %x, %select.pow2
   ret float %mul
 }
 
-define float @v_mul_f32_select_n1_n64(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_n1_n64:
+define float @v_mul_f32_select_n128_n64(i32 %arg, float %x) {
+; GFX9-SDAG-LABEL: v_mul_f32_select_n128_n64:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc
 ; GFX9-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_mul_f32_select_n1_n64:
+; GFX9-GISEL-LABEL: v_mul_f32_select_n128_n64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2800000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1.0, vcc
-; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, -v1, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_mul_f32_select_n1_n64:
+; GFX10-SDAG-LABEL: v_mul_f32_select_n128_n64:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
 ; GFX10-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: v_mul_f32_select_n1_n64:
+; GFX10-GISEL-LABEL: v_mul_f32_select_n128_n64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_ldexp_f32 v0, -v1, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_f32_select_n1_n64:
+; GFX11-SDAG-LABEL: v_mul_f32_select_n128_n64:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
 ; GFX11-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f32_select_n1_n64:
+; GFX11-GISEL-LABEL: v_mul_f32_select_n128_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_ldexp_f32 v0, -v1, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select i1 %cond, float -1.0, float -64.0
+  %select.pow2 = select i1 %cond, float -128.0, float -64.0
   %mul = fmul float %x, %select.pow2
   ret float %mul
 }
 
-define float @v_mul_f32_select_n64_n1(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_n64_n1:
+define float @v_mul_f32_select_n128_n16(i32 %arg, float %x) {
+; GFX9-LABEL: v_mul_f32_select_n128_n16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: v_mul_f32_select_n128_n16:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  %cond = icmp eq i32 %arg, 0
+  %select.pow2 = select i1 %cond, float -128.0, float -16.0
+  %mul = fmul float %x, %select.pow2
+  ret float %mul
+}
+
+define float @v_contract_mul_add_f32_select_64_1(i32 %arg, float %x, float %y) {
+; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_64_1:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
+; GFX9-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_mul_f32_select_n64_n1:
+; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_64_1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2800000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1.0, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_mul_f32_select_n64_n1:
+; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_64_1:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo
+; GFX10-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: v_mul_f32_select_n64_n1:
+; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_64_1:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_f32_select_n64_n1:
+; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_64_1:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo
+; GFX11-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f32_select_n64_n1:
+; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_64_1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select i1 %cond, float -64.0, float -1.0
-  %mul = fmul float %x, %select.pow2
-  ret float %mul
+  %select.pow2 = select contract i1 %cond, float 64.0, float 1.0
+  %mul = fmul contract float %x, %select.pow2
+  %fma = fadd contract float %mul, %y
+  ret float %fma
 }
 
-define float @v_mul_f32_select_128_64(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_128_64:
+define float @v_contract_mul_add_f32_select_1_64(i32 %arg, float %x, float %y) {
+; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_1_64:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, v3, 1.0, vcc
+; GFX9-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_mul_f32_select_128_64:
+; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_1_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x43000000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_mul_f32_select_128_64:
+; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_1_64:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo
+; GFX10-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: v_mul_f32_select_128_64:
+; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_1_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x43000000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_f32_select_128_64:
+; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_1_64:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo
+; GFX11-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f32_select_128_64:
+; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_1_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x43000000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select i1 %cond, float 128.0, float 64.0
-  %mul = fmul float %x, %select.pow2
-  ret float %mul
+  %select.pow2 = select contract i1 %cond, float 1.0, float 64.0
+  %mul = fmul contract float %x, %select.pow2
+  %fma = fadd contract float %mul, %y
+  ret float %fma
 }
 
-define float @v_mul_f32_select_n128_n64(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_n128_n64:
+define float @v_contract_mul_add_f32_select_n64_n1(i32 %arg, float %x, float %y) {
+; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_n64_n1:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc2800000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, -1.0, v3, vcc
+; GFX9-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_mul_f32_select_n128_n64:
+; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_n64_n1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc3000000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc2800000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_mul_f32_select_n128_n64:
+; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_n64_n1:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo
+; GFX10-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: v_mul_f32_select_n128_n64:
+; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_n64_n1:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2800000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xc3000000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_f32_select_n128_n64:
+; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_n64_n1:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo
+; GFX11-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f32_select_n128_n64:
+; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_n64_n1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2800000
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xc3000000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select i1 %cond, float -128.0, float -64.0
-  %mul = fmul float %x, %select.pow2
-  ret float %mul
+  %select.pow2 = select contract i1 %cond, float -64.0, float -1.0
+  %mul = fmul contract float %x, %select.pow2
+  %fma = fadd contract float %mul, %y
+  ret float %fma
 }
 
-define float @v_mul_f32_select_n128_n16(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_n128_n16:
+define float @v_contract_mul_add_f32_select_n1_n64(i32 %arg, float %x, float %y) {
+; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_n1_n64:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc2800000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, v3, -1.0, vcc
+; GFX9-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_mul_f32_select_n128_n16:
+; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_n1_n64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc3000000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1800000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_mul_f32_select_n128_n16:
+; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_n1_n64:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo
+; GFX10-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: v_mul_f32_select_n128_n16:
+; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_n1_n64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc1800000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xc3000000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_f32_select_n128_n16:
+; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_n1_n64:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo
+; GFX11-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f32_select_n128_n16:
+; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_n1_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc1800000
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xc3000000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select i1 %cond, float -128.0, float -16.0
-  %mul = fmul float %x, %select.pow2
-  ret float %mul
-}
-
-define float @v_contract_mul_add_f32_select_64_1(i32 %arg, float %x, float %y) {
-; GFX9-LABEL: v_contract_mul_add_f32_select_64_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x42800000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; GFX9-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: v_contract_mul_add_f32_select_64_1:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo
-; GFX1011-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
-  %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select contract i1 %cond, float 64.0, float 1.0
-  %mul = fmul contract float %x, %select.pow2
-  %fma = fadd contract float %mul, %y
-  ret float %fma
-}
-
-define float @v_contract_mul_add_f32_select_1_64(i32 %arg, float %x, float %y) {
-; GFX9-LABEL: v_contract_mul_add_f32_select_1_64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x42800000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, 1.0, vcc
-; GFX9-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: v_contract_mul_add_f32_select_1_64:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo
-; GFX1011-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
-  %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select contract i1 %cond, float 1.0, float 64.0
-  %mul = fmul contract float %x, %select.pow2
-  %fma = fadd contract float %mul, %y
-  ret float %fma
-}
-
-define float @v_contract_mul_add_f32_select_n64_n1(i32 %arg, float %x, float %y) {
-; GFX9-LABEL: v_contract_mul_add_f32_select_n64_n1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xc2800000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, -1.0, v3, vcc
-; GFX9-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: v_contract_mul_add_f32_select_n64_n1:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT:    v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo
-; GFX1011-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
-  %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select contract i1 %cond, float -64.0, float -1.0
-  %mul = fmul contract float %x, %select.pow2
-  %fma = fadd contract float %mul, %y
-  ret float %fma
-}
-
-define float @v_contract_mul_add_f32_select_n1_n64(i32 %arg, float %x, float %y) {
-; GFX9-LABEL: v_contract_mul_add_f32_select_n1_n64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xc2800000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, -1.0, vcc
-; GFX9-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: v_contract_mul_add_f32_select_n1_n64:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo
-; GFX1011-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
-  %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, float -1.0, float -64.0
   %mul = fmul contract float %x, %select.pow2
   %fma = fadd contract float %mul, %y
@@ -3810,11 +3780,11 @@ define float @v_contract_mul_add_f32_select_128_64(i32 %arg, float %x, float %y)
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x43000000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x42800000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_128_64:
@@ -3829,10 +3799,11 @@ define float @v_contract_mul_add_f32_select_128_64(i32 %arg, float %x, float %y)
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x43000000, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_128_64:
@@ -3847,10 +3818,11 @@ define float @v_contract_mul_add_f32_select_128_64(i32 %arg, float %x, float %y)
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x43000000, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, float 128.0, float 64.0
@@ -3860,22 +3832,57 @@ define float @v_contract_mul_add_f32_select_128_64(i32 %arg, float %x, float %y)
 }
 
 define float @v_contract_mul_add_f32_select_128_4(i32 %arg, float %x, float %y) {
-; GFX9-LABEL: v_contract_mul_add_f32_select_128_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x43000000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, 4.0, v3, vcc
-; GFX9-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_128_4:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x43000000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, 4.0, v3, vcc
+; GFX9-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1011-LABEL: v_contract_mul_add_f32_select_128_4:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 4.0, 0x43000000, vcc_lo
-; GFX1011-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_128_4:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_128_4:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4.0, 0x43000000, vcc_lo
+; GFX10-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_128_4:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_128_4:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4.0, 0x43000000, vcc_lo
+; GFX11-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_128_4:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, float 128.0, float 4.0
   %mul = fmul contract float %x, %select.pow2
@@ -3907,143 +3914,102 @@ define float @v_contract_mul_add_f32_select_2_4(i32 %arg, float %x, float %y) {
 }
 
 define float @v_contract_mul_add_f32_select_4_128(i32 %arg, float %x, float %y) {
-; GFX9-LABEL: v_contract_mul_add_f32_select_4_128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x43000000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, 4.0, vcc
-; GFX9-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: v_contract_mul_add_f32_select_4_128:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 0x43000000, 4.0, vcc_lo
-; GFX1011-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
-  %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select i1 %cond, float 4.0, float 128.0
-  %mul = fmul contract float %x, %select.pow2
-  %fma = fadd contract float %mul, %y
-  ret float %fma
-}
-
-define double @v_mul_f64_select_64_1(i32 %arg, double %x) {
-; GFX9-SDAG-LABEL: v_mul_f64_select_64_1:
+; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_4_128:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x43000000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, v3, 4.0, vcc
+; GFX9-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_mul_f64_select_64_1:
+; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_4_128:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x40500000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3ff00000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_mul_f64_select_64_1:
+; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_4_128:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0x43000000, 4.0, vcc_lo
+; GFX10-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: v_mul_f64_select_64_1:
+; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_4_128:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0x40500000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_f64_select_64_1:
+; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_4_128:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0x43000000, 4.0, vcc_lo
+; GFX11-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f64_select_64_1:
+; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_4_128:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0x3ff00000 :: v_dual_mov_b32 v3, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0x40500000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
+  %select.pow2 = select i1 %cond, float 4.0, float 128.0
+  %mul = fmul contract float %x, %select.pow2
+  %fma = fadd contract float %mul, %y
+  ret float %fma
+}
+
+define double @v_mul_f64_select_64_1(i32 %arg, double %x) {
+; GFX9-LABEL: v_mul_f64_select_64_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: v_mul_f64_select_64_1:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 64.0, double 1.0
   %mul = fmul double %x, %select.pow2
   ret double %mul
 }
 
 define double @v_mul_f64_select_1_64(i32 %arg, double %x) {
-; GFX9-SDAG-LABEL: v_mul_f64_select_1_64:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f64_select_1_64:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40500000
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f64_select_1_64:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f64_select_1_64:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x40500000
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0x3ff00000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f64_select_1_64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f64_select_1_64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f64_select_1_64:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0x40500000 :: v_dual_mov_b32 v3, 0
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0x3ff00000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f64_select_1_64:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 1.0, double 64.0
   %mul = fmul double %x, %select.pow2
@@ -4051,59 +4017,21 @@ define double @v_mul_f64_select_1_64(i32 %arg, double %x) {
 }
 
 define double @v_mul_f64_select_n1_n64(i32 %arg, double %x) {
-; GFX9-SDAG-LABEL: v_mul_f64_select_n1_n64:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f64_select_n1_n64:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xbff00000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc0500000
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f64_select_n1_n64:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f64_select_n1_n64:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc0500000
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0xbff00000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f64_select_n1_n64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f64_select_n1_n64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f64_select_n1_n64:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0xc0500000 :: v_dual_mov_b32 v3, 0
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0xbff00000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f64_select_n1_n64:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double -1.0, double -64.0
   %mul = fmul double %x, %select.pow2
@@ -4122,12 +4050,10 @@ define double @v_mul_f64_select_128_64(i32 %arg, double %x) {
 ; GFX9-GISEL-LABEL: v_mul_f64_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x40600000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40500000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f64_select_128_64:
@@ -4141,11 +4067,10 @@ define double @v_mul_f64_select_128_64(i32 %arg, double %x) {
 ; GFX10-GISEL-LABEL: v_mul_f64_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x40500000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0x40600000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f64_select_128_64:
@@ -4159,10 +4084,10 @@ define double @v_mul_f64_select_128_64(i32 %arg, double %x) {
 ; GFX11-GISEL-LABEL: v_mul_f64_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0x40500000 :: v_dual_mov_b32 v3, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0x40600000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 128.0, double 64.0
@@ -4182,12 +4107,10 @@ define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) {
 ; GFX9-GISEL-LABEL: v_mul_f64_select_n128_n64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc0600000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc0500000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f64_select_n128_n64:
@@ -4201,11 +4124,10 @@ define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) {
 ; GFX10-GISEL-LABEL: v_mul_f64_select_n128_n64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc0500000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0xc0600000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f64_select_n128_n64:
@@ -4219,10 +4141,10 @@ define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) {
 ; GFX11-GISEL-LABEL: v_mul_f64_select_n128_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0xc0500000 :: v_dual_mov_b32 v3, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0xc0600000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double -128.0, double -64.0
@@ -4231,59 +4153,21 @@ define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) {
 }
 
 define double @v_mul_f64_select_n128_n16(i32 %arg, double %x) {
-; GFX9-SDAG-LABEL: v_mul_f64_select_n128_n16:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f64_select_n128_n16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc0600000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc0300000
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f64_select_n128_n16:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f64_select_n128_n16:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc0300000
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0xc0600000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f64_select_n128_n16:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f64_select_n128_n16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f64_select_n128_n16:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0xc0300000 :: v_dual_mov_b32 v3, 0
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0xc0600000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f64_select_n128_n16:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double -128.0, double -16.0
   %mul = fmul double %x, %select.pow2
@@ -4305,12 +4189,10 @@ define double @v_contract_mul_add_f64_select_64_1(i32 %arg, double %x, double %y
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_64_1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40500000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3ff00000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_64_1:
@@ -4326,11 +4208,10 @@ define double @v_contract_mul_add_f64_select_64_1(i32 %arg, double %x, double %y
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_64_1:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40500000, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_64_1:
@@ -4345,10 +4226,10 @@ define double @v_contract_mul_add_f64_select_64_1(i32 %arg, double %x, double %y
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_64_1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0x3ff00000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40500000, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, double 64.0, double 1.0
@@ -4372,12 +4253,10 @@ define double @v_contract_mul_add_f64_select_1_64(i32 %arg, double %x, double %y
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_1_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0x40500000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_1_64:
@@ -4393,11 +4272,10 @@ define double @v_contract_mul_add_f64_select_1_64(i32 %arg, double %x, double %y
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_1_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40500000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x3ff00000, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_1_64:
@@ -4412,10 +4290,10 @@ define double @v_contract_mul_add_f64_select_1_64(i32 %arg, double %x, double %y
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_1_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0x40500000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x3ff00000, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, double 1.0, double 64.0
@@ -4439,12 +4317,10 @@ define double @v_contract_mul_add_f64_select_n64_n1(i32 %arg, double %x, double
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_n64_n1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0xc0500000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0xbff00000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_n64_n1:
@@ -4460,11 +4336,10 @@ define double @v_contract_mul_add_f64_select_n64_n1(i32 %arg, double %x, double
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_n64_n1:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0xbff00000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0xc0500000, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_n64_n1:
@@ -4479,10 +4354,10 @@ define double @v_contract_mul_add_f64_select_n64_n1(i32 %arg, double %x, double
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_n64_n1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0xbff00000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0xc0500000, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, double -64.0, double -1.0
@@ -4506,12 +4381,10 @@ define double @v_contract_mul_add_f64_select_n1_n64(i32 %arg, double %x, double
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_n1_n64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0xbff00000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0xc0500000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_n1_n64:
@@ -4527,11 +4400,10 @@ define double @v_contract_mul_add_f64_select_n1_n64(i32 %arg, double %x, double
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_n1_n64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0xc0500000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0xbff00000, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_n1_n64:
@@ -4546,10 +4418,10 @@ define double @v_contract_mul_add_f64_select_n1_n64(i32 %arg, double %x, double
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_n1_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0xc0500000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0xbff00000, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, double -1.0, double -64.0
@@ -4573,12 +4445,11 @@ define double @v_contract_mul_add_f64_select_128_64(i32 %arg, double %x, double
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40600000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0x40500000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_128_64:
@@ -4594,11 +4465,11 @@ define double @v_contract_mul_add_f64_select_128_64(i32 %arg, double %x, double
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40500000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40600000, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_128_64:
@@ -4613,10 +4484,11 @@ define double @v_contract_mul_add_f64_select_128_64(i32 %arg, double %x, double
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0x40500000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40600000, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 128.0, double 64.0
@@ -4640,12 +4512,10 @@ define double @v_contract_mul_add_f64_select_128_4(i32 %arg, double %x, double %
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_128_4:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40600000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0x40100000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_128_4:
@@ -4661,11 +4531,10 @@ define double @v_contract_mul_add_f64_select_128_4(i32 %arg, double %x, double %
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_128_4:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40100000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40600000, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_128_4:
@@ -4680,10 +4549,10 @@ define double @v_contract_mul_add_f64_select_128_4(i32 %arg, double %x, double %
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_128_4:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0x40100000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40600000, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 128.0, double 4.0
@@ -4706,21 +4575,50 @@ define double @v_contract_mul_add_f64_select_2_4(i32 %arg, double %x, double %y)
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_2_4:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40100000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 2.0, vcc
-; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1011-LABEL: v_contract_mul_add_f64_select_2_4:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT:    v_mov_b32_e32 v5, 0
-; GFX1011-NEXT:    v_cndmask_b32_e64 v6, 0x40100000, 2.0, vcc_lo
-; GFX1011-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_2_4:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v6, 0x40100000, 2.0, vcc_lo
+; GFX10-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_2_4:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_2_4:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v5, 0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v6, 0x40100000, 2.0, vcc_lo
+; GFX11-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_2_4:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 2.0, double 4.0
   %mul = fmul contract double %x, %select.pow2
@@ -4743,12 +4641,10 @@ define double @v_contract_mul_add_f64_select_4_128(i32 %arg, double %x, double %
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_4_128:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40100000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0x40600000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_4_128:
@@ -4764,11 +4660,10 @@ define double @v_contract_mul_add_f64_select_4_128(i32 %arg, double %x, double %
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_4_128:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40600000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40100000, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_4_128:
@@ -4783,10 +4678,10 @@ define double @v_contract_mul_add_f64_select_4_128(i32 %arg, double %x, double %
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_4_128:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0x40600000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40100000, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 4.0, double 128.0
@@ -4796,57 +4691,21 @@ define double @v_contract_mul_add_f64_select_4_128(i32 %arg, double %x, double %
 }
 
 define half @v_mul_f16_select_64_1(i32 %arg, half %x) {
-; GFX9-SDAG-LABEL: v_mul_f16_select_64_1:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f16_select_64_1:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5400
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f16_select_64_1:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f16_select_64_1:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3c00
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x5400, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f16_select_64_1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f16_select_64_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f16_select_64_1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3c00
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x5400, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f16_select_64_1:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 64.0, half 1.0
   %mul = fmul half %x, %select.pow2
@@ -4854,57 +4713,21 @@ define half @v_mul_f16_select_64_1(i32 %arg, half %x) {
 }
 
 define half @v_mul_f16_select_1_64(i32 %arg, half %x) {
-; GFX9-SDAG-LABEL: v_mul_f16_select_1_64:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f16_select_1_64:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3c00
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f16_select_1_64:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f16_select_1_64:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5400
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x3c00, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f16_select_1_64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f16_select_1_64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f16_select_1_64:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5400
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x3c00, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f16_select_1_64:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 1.0, half 64.0
   %mul = fmul half %x, %select.pow2
@@ -4912,57 +4735,21 @@ define half @v_mul_f16_select_1_64(i32 %arg, half %x) {
 }
 
 define half @v_mul_f16_select_n1_n64(i32 %arg, half %x) {
-; GFX9-SDAG-LABEL: v_mul_f16_select_n1_n64:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f16_select_n1_n64:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xbc00
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f16_select_n1_n64:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f16_select_n1_n64:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd400
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xbc00, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f16_select_n1_n64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f16_select_n1_n64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f16_select_n1_n64:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd400
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xbc00, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f16_select_n1_n64:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half -1.0, half -64.0
   %mul = fmul half %x, %select.pow2
@@ -4981,11 +4768,13 @@ define half @v_mul_f16_select_128_64(i32 %arg, half %x) {
 ; GFX9-GISEL-LABEL: v_mul_f16_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5800
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v0, v0, v2, v3
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f16_select_128_64:
@@ -4999,10 +4788,12 @@ define half @v_mul_f16_select_128_64(i32 %arg, half %x) {
 ; GFX10-GISEL-LABEL: v_mul_f16_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x5800, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v2
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f16_select_128_64:
@@ -5016,10 +4807,12 @@ define half @v_mul_f16_select_128_64(i32 %arg, half %x) {
 ; GFX11-GISEL-LABEL: v_mul_f16_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x5800, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 128.0, half 64.0
@@ -5039,11 +4832,13 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) {
 ; GFX9-GISEL-LABEL: v_mul_f16_select_n128_n64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd800
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v0, v0, v2, v3
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f16_select_n128_n64:
@@ -5057,10 +4852,12 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) {
 ; GFX10-GISEL-LABEL: v_mul_f16_select_n128_n64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v2
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f16_select_n128_n64:
@@ -5074,10 +4871,12 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) {
 ; GFX11-GISEL-LABEL: v_mul_f16_select_n128_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half -128.0, half -64.0
@@ -5086,57 +4885,21 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) {
 }
 
 define half @v_mul_f16_select_n128_n16(i32 %arg, half %x) {
-; GFX9-SDAG-LABEL: v_mul_f16_select_n128_n16:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f16_select_n128_n16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd800
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xcc00
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f16_select_n128_n16:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f16_select_n128_n16:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0xcc00
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f16_select_n128_n16:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f16_select_n128_n16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc
+; GFX9-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f16_select_n128_n16:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xcc00
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f16_select_n128_n16:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half -128.0, half -16.0
   %mul = fmul half %x, %select.pow2
@@ -5157,11 +4920,10 @@ define half @v_contract_mul_add_f16_select_64_1(i32 %arg, half %x, half %y) {
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_64_1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_64_1:
@@ -5176,10 +4938,10 @@ define half @v_contract_mul_add_f16_select_64_1(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_64_1:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_64_1:
@@ -5194,10 +4956,10 @@ define half @v_contract_mul_add_f16_select_64_1(i32 %arg, half %x, half %y) {
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_64_1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, half 64.0, half 1.0
@@ -5220,11 +4982,10 @@ define half @v_contract_mul_add_f16_select_1_64(i32 %arg, half %x, half %y) {
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_1_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_1_64:
@@ -5239,10 +5000,10 @@ define half @v_contract_mul_add_f16_select_1_64(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_1_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_1_64:
@@ -5257,10 +5018,10 @@ define half @v_contract_mul_add_f16_select_1_64(i32 %arg, half %x, half %y) {
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_1_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, half 1.0, half 64.0
@@ -5283,11 +5044,10 @@ define half @v_contract_mul_add_f16_select_n64_n1(i32 %arg, half %x, half %y) {
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_n64_n1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_n64_n1:
@@ -5302,10 +5062,10 @@ define half @v_contract_mul_add_f16_select_n64_n1(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_n64_n1:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd400, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_n64_n1:
@@ -5320,10 +5080,10 @@ define half @v_contract_mul_add_f16_select_n64_n1(i32 %arg, half %x, half %y) {
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_n64_n1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd400, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, half -64.0, half -1.0
@@ -5346,11 +5106,10 @@ define half @v_contract_mul_add_f16_select_n1_n64(i32 %arg, half %x, half %y) {
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_n1_n64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xbc00
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_n1_n64:
@@ -5365,10 +5124,10 @@ define half @v_contract_mul_add_f16_select_n1_n64(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_n1_n64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_n1_n64:
@@ -5383,10 +5142,10 @@ define half @v_contract_mul_add_f16_select_n1_n64(i32 %arg, half %x, half %y) {
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_n1_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, half -1.0, half -64.0
@@ -5409,11 +5168,14 @@ define half @v_contract_mul_add_f16_select_128_64(i32 %arg, half %x, half %y) {
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5800
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v0, v0, v3, v4
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_128_64:
@@ -5428,10 +5190,13 @@ define half @v_contract_mul_add_f16_select_128_64(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_128_64:
@@ -5446,10 +5211,13 @@ define half @v_contract_mul_add_f16_select_128_64(i32 %arg, half %x, half %y) {
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 128.0, half 64.0
@@ -5472,11 +5240,10 @@ define half @v_contract_mul_add_f16_select_128_4(i32 %arg, half %x, half %y) {
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_128_4:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5800
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_128_4:
@@ -5491,10 +5258,10 @@ define half @v_contract_mul_add_f16_select_128_4(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_128_4:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_128_4:
@@ -5509,10 +5276,10 @@ define half @v_contract_mul_add_f16_select_128_4(i32 %arg, half %x, half %y) {
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_128_4:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 128.0, half 4.0
@@ -5535,11 +5302,14 @@ define half @v_contract_mul_add_f16_select_2_4(i32 %arg, half %x, half %y) {
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_2_4:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v0, v0, v3, v4
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_2_4:
@@ -5554,10 +5324,13 @@ define half @v_contract_mul_add_f16_select_2_4(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_2_4:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x4000, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_2_4:
@@ -5572,10 +5345,13 @@ define half @v_contract_mul_add_f16_select_2_4(i32 %arg, half %x, half %y) {
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_2_4:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x4000, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 2.0, half 4.0
@@ -5598,11 +5374,10 @@ define half @v_contract_mul_add_f16_select_4_128(i32 %arg, half %x, half %y) {
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_4_128:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4400
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_4_128:
@@ -5617,10 +5392,10 @@ define half @v_contract_mul_add_f16_select_4_128(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_4_128:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x4400, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_4_128:
@@ -5635,10 +5410,10 @@ define half @v_contract_mul_add_f16_select_4_128(i32 %arg, half %x, half %y) {
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_4_128:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x4400, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 4.0, half 128.0
@@ -5664,15 +5439,13 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-GISEL-LABEL: v_mul_v2f16_select_64_1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_v2f16_select_64_1:
@@ -5690,14 +5463,14 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-LABEL: v_mul_v2f16_select_64_1:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x5400, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_v2f16_select_64_1:
@@ -5715,14 +5488,15 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_64_1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x5400, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v3, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 64.0, half 64.0>, <2 x half> <half 1.0, half 1.0>
@@ -5747,15 +5521,13 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-GISEL-LABEL: v_mul_v2f16_select_1_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_v2f16_select_1_64:
@@ -5773,14 +5545,14 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-LABEL: v_mul_v2f16_select_1_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x3c00, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_v2f16_select_1_64:
@@ -5798,14 +5570,15 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_1_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x3c00, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v3, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 1.0, half 1.0>, <2 x half> <half 64.0, half 64.0>
@@ -5830,15 +5603,14 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-GISEL-LABEL: v_mul_v2f16_select_n1_n64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xbc00
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_v2f16_select_n1_n64:
@@ -5856,14 +5628,15 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-LABEL: v_mul_v2f16_select_n1_n64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xbc00, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_v2f16_select_n1_n64:
@@ -5881,14 +5654,16 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_n1_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xbc00, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v3, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half -1.0, half -1.0>, <2 x half> <half -64.0, half -64.0>
@@ -5913,15 +5688,19 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-GISEL-LABEL: v_mul_v2f16_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5800
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 6, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v0, v0, v3, v4
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v3, v4
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_v2f16_select_128_64:
@@ -5939,14 +5718,19 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-LABEL: v_mul_v2f16_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 6, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x5800, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_v2f16_select_128_64:
@@ -5964,14 +5748,20 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v1, 6, v1
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 128.0, half 128.0>, <2 x half> <half 64.0, half 64.0>
@@ -5996,15 +5786,20 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-GISEL-LABEL: v_mul_v2f16_select_n128_n64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd800
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 6, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7fff
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX9-GISEL-NEXT:    v_med3_i32 v0, v0, v3, v4
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v3, v4
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_v2f16_select_n128_n64:
@@ -6022,14 +5817,20 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-LABEL: v_mul_v2f16_select_n128_n64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 6, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_v2f16_select_n128_n64:
@@ -6047,14 +5848,21 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_n128_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v1, 6, v1
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half -128.0, half -128.0>, <2 x half> <half -64.0, half -64.0>
@@ -6079,15 +5887,14 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-GISEL-LABEL: v_mul_v2f16_select_n128_n16:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd800
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xcc00
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 4, 7, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_v2f16_select_n128_n16:
@@ -6105,14 +5912,15 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-LABEL: v_mul_v2f16_select_n128_n16:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xcc00
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 4, 7, vcc_lo
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_v2f16_select_n128_n16:
@@ -6130,14 +5938,16 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_n128_n16:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xcc00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 4, 7, vcc_lo
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v3, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half -128.0, half -128.0>, <2 x half> <half -16.0, half -16.0>
@@ -6162,15 +5972,14 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_64_1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3c00
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_64_1:
@@ -6188,14 +5997,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_64_1:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5400, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5400, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_64_1:
@@ -6213,14 +6023,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_64_1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5400, vcc_lo
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5400, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 64.0, half 64.0>, <2 x half> <half 1.0, half 1.0>
@@ -6246,15 +6058,14 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_1_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3c00
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_1_64:
@@ -6272,14 +6083,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_1_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x3c00, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x3c00, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_1_64:
@@ -6297,14 +6109,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_1_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x3c00, vcc_lo
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x3c00, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 1.0, half 1.0>, <2 x half> <half 64.0, half 64.0>
@@ -6330,15 +6144,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xd400
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xbc00
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
@@ -6356,14 +6170,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0xd400, vcc_lo
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0xd400, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
@@ -6381,14 +6197,17 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0xd400, vcc_lo
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0xd400, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half -64.0, half -64.0>, <2 x half> <half -1.0, half -1.0>
@@ -6414,15 +6233,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xbc00
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xd400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
@@ -6440,14 +6259,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0xbc00, vcc_lo
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0xbc00, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
@@ -6465,14 +6286,17 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0xbc00, vcc_lo
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0xbc00, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half -1.0, half -1.0>, <2 x half> <half -64.0, half -64.0>
@@ -6498,15 +6322,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5800
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 6, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v0, v0, v4, v5
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v4, v5
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_64:
@@ -6524,14 +6353,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v4
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 6, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v4
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_64:
@@ -6549,14 +6384,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0x7fff :: v_dual_add_nc_u32 v1, 6, v1
+; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v4
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v4
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v5, v1
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 128.0, half 128.0>, <2 x half> <half 64.0, half 64.0>
@@ -6582,15 +6423,14 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_4:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5800
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x4400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 2, 7, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_4:
@@ -6608,14 +6448,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_4:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 2, 7, vcc_lo
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_4:
@@ -6633,14 +6474,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_4:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 2, 7, vcc_lo
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 128.0, half 128.0>, <2 x half> <half 4.0, half 4.0>
@@ -6666,15 +6509,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_2_4:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x4400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 2, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v0, v0, v4, v5
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v4, v5
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_2_4:
@@ -6692,14 +6540,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_2_4:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x4000, vcc_lo
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v4
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 2, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v4
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x4000, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_2_4:
@@ -6717,14 +6571,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_2_4:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x4000, vcc_lo
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0x7fff :: v_dual_add_nc_u32 v1, 2, v1
+; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v4
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v4
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v5, v1
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x4000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 2.0, half 2.0>, <2 x half> <half 4.0, half 4.0>
@@ -6750,15 +6610,14 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_4_128:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x5800
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 7, 2, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_4_128:
@@ -6776,14 +6635,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_4_128:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x4400, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 7, 2, vcc_lo
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x4400, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_4_128:
@@ -6801,14 +6661,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_4_128:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x4400, vcc_lo
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 7, 2, vcc_lo
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x4400, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 4.0, half 4.0>, <2 x half> <half 128.0, half 128.0>
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index f6ee007..80b4d64 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -14,6 +14,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
 
 ; Test patterns to match v_fract_* instructions.
 
@@ -103,6 +104,21 @@ define float @safe_math_fract_f32(float %x, ptr addrspace(1) nocapture writeonly
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc_lo
 ; GFX11-NEXT:    global_store_b32 v[1:2], v4, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v3, v0
+; GFX12-NEXT:    v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX12-NEXT:    v_floor_f32_e32 v4, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc_lo
+; GFX12-NEXT:    global_store_b32 v[1:2], v4, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -181,6 +197,18 @@ define float @safe_math_fract_f32_noinf_check(float %x, ptr addrspace(1) nocaptu
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v[1:2], v3, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_f32_noinf_check:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v3, v0
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    global_store_b32 v[1:2], v3, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -263,6 +291,22 @@ define float @no_nan_check_math_fract_f32(float %x, ptr addrspace(1) nocapture w
 ; GFX11-NEXT:    v_min_f32_e32 v4, 0x3f7fffff, v4
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_nan_check_math_fract_f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v3, v0
+; GFX12-NEXT:    v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v4, v0, v3
+; GFX12-NEXT:    global_store_b32 v[1:2], v3, off
+; GFX12-NEXT:    v_min_num_f32_e32 v4, 0x3f7fffff, v4
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -314,6 +358,16 @@ define float @basic_fract_f32_nonans(float nofpclass(nan) %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_f32_nonans:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -362,6 +416,19 @@ define float @basic_fract_f32_flags_minnum(float %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_f32_flags_minnum:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT:    v_min_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -409,6 +476,16 @@ define float @basic_fract_f32_flags_fsub(float nofpclass(nan) %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_f32_flags_fsub:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub nsz float %x, %floor
@@ -467,6 +544,17 @@ define <2 x float> @basic_fract_v2f32_nonans(<2 x float> nofpclass(nan) %x) {
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    v_fract_f32_e32 v1, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_v2f32_nonans:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    v_fract_f32_e32 v1, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %x)
   %sub = fsub <2 x float> %x, %floor
@@ -540,6 +628,20 @@ define float @basic_fract_f32_multi_use_fsub_nonans(float nofpclass(nan) %x, ptr
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v[1:2], v3, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_f32_multi_use_fsub_nonans:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v3, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v3, v0, v3
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    global_store_b32 v[1:2], v3, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -588,6 +690,16 @@ define float @nnan_minnum_fract_f32(float %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: nnan_minnum_fract_f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -638,6 +750,19 @@ define float @nnan_fsub_fract_f32(float %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: nnan_fsub_fract_f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT:    v_min_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub nnan float %x, %floor
@@ -686,6 +811,19 @@ define float @nnan_floor_fract_f32(float %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: nnan_floor_fract_f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT:    v_min_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call nnan float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -733,6 +871,16 @@ define float @nnan_src_fract_f32(float nofpclass(nan) %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: nnan_src_fract_f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -782,6 +930,19 @@ define float @not_fract_f32_wrong_const(float nofpclass(nan) %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7ffffe, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: not_fract_f32_wrong_const:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT:    v_min_num_f32_e32 v0, 0x3f7ffffe, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -831,6 +992,19 @@ define float @not_fract_f32_swapped_fsub(float nofpclass(nan) %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: not_fract_f32_swapped_fsub:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v0, v1, v0
+; GFX12-NEXT:    v_min_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %floor, %x
@@ -880,6 +1054,19 @@ define float @not_fract_f32_not_floor(float nofpclass(nan) %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: not_fract_f32_not_floor:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT:    v_min_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.trunc.f32(float %x)
   %sub = fsub float %x, %floor
@@ -929,6 +1116,19 @@ define float @not_fract_f32_different_floor(float %x, float %y) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: not_fract_f32_different_floor:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v1, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT:    v_min_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %y)
   %sub = fsub float %x, %floor
@@ -978,6 +1178,19 @@ define float @not_fract_f32_maxnum(float nofpclass(nan) %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_max_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: not_fract_f32_maxnum:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT:    v_max_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -1000,6 +1213,15 @@ define float @fcmp_uno_check_is_nan_f32(float %x) {
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fcmp_uno_check_is_nan_f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -1054,6 +1276,16 @@ define float @select_nan_fract_f32(float %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: select_nan_fract_f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -1107,6 +1339,16 @@ define float @commuted_select_nan_fract_f32(float %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: commuted_select_nan_fract_f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -1168,6 +1410,22 @@ define float @wrong_commuted_nan_select_f32(float %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: wrong_commuted_nan_select_f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v1, v0
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v1, v0, v1
+; GFX12-NEXT:    v_min_num_f32_e32 v1, 0x3f7fffff, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -1231,6 +1489,16 @@ define half @basic_fract_f16_nonan(half nofpclass(nan) %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f16_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_f16_nonan:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f16_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call half @llvm.floor.f16(half %x)
   %sub = fsub half %x, %floor
@@ -1313,6 +1581,20 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
 ; GFX11-NEXT:    v_fract_f16_e32 v1, v1
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_v2f16_nonan:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-NEXT:    v_fract_f16_e32 v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_fract_f16_e32 v1, v1
+; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call <2 x half> @llvm.floor.v2f16(<2 x half> %x)
   %sub = fsub <2 x half> %x, %floor
@@ -1369,6 +1651,16 @@ define double @basic_fract_f64_nanans(double nofpclass(nan) %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f64_e32 v[0:1], v[0:1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_f64_nanans:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f64_e32 v[0:1], v[0:1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call double @llvm.floor.f64(double %x)
   %sub = fsub double %x, %floor
@@ -1461,6 +1753,18 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) nocapture
 ; GFX11-NEXT:    v_fract_f16_e32 v0, v0
 ; GFX11-NEXT:    global_store_b16 v[1:2], v3, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_f16_noinf_check:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f16_e32 v3, v0
+; GFX12-NEXT:    v_fract_f16_e32 v0, v0
+; GFX12-NEXT:    global_store_b16 v[1:2], v3, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call half @llvm.floor.f16(half %x)
   %sub = fsub half %x, %floor
@@ -1546,6 +1850,18 @@ define double @safe_math_fract_f64_noinf_check(double %x, ptr addrspace(1) nocap
 ; GFX11-NEXT:    v_fract_f64_e32 v[0:1], v[0:1]
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[4:5], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_f64_noinf_check:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f64_e32 v[4:5], v[0:1]
+; GFX12-NEXT:    v_fract_f64_e32 v[0:1], v[0:1]
+; GFX12-NEXT:    global_store_b64 v[2:3], v[4:5], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call double @llvm.floor.f64(double %x)
   %sub = fsub double %x, %floor
@@ -1600,6 +1916,16 @@ define float @select_nan_fract_f32_flags_select(float %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: select_nan_fract_f32_flags_select:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -1653,6 +1979,16 @@ define float @select_nan_fract_f32_flags_minnum(float %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: select_nan_fract_f32_flags_minnum:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -1769,6 +2105,25 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[4:5], off
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v7, 0, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_v2f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v6, v0
+; GFX12-NEXT:    v_cmp_class_f32_e64 s0, v0, 0x204
+; GFX12-NEXT:    v_fract_f32_e32 v7, v1
+; GFX12-NEXT:    v_floor_f32_e32 v4, v0
+; GFX12-NEXT:    v_floor_f32_e32 v5, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-NEXT:    v_cndmask_b32_e64 v0, v6, 0, s0
+; GFX12-NEXT:    v_cmp_class_f32_e64 s0, v1, 0x204
+; GFX12-NEXT:    global_store_b64 v[2:3], v[4:5], off
+; GFX12-NEXT:    v_cndmask_b32_e64 v1, v7, 0, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %x)
   %sub = fsub <2 x float> %x, %floor
@@ -1881,6 +2236,21 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) nocapture writeon
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[6:7], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_f64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f64_e32 v[4:5], v[0:1]
+; GFX12-NEXT:    v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]|
+; GFX12-NEXT:    v_floor_f64_e32 v[6:7], v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT:    v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5
+; GFX12-NEXT:    global_store_b64 v[2:3], v[6:7], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call double @llvm.floor.f64(double %x)
   %sub = fsub double %x, %floor
@@ -2002,6 +2372,21 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly %
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc_lo
 ; GFX11-NEXT:    global_store_b16 v[1:2], v4, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f16_e32 v3, v0
+; GFX12-NEXT:    v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0|
+; GFX12-NEXT:    v_floor_f16_e32 v4, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc_lo
+; GFX12-NEXT:    global_store_b16 v[1:2], v4, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call half @llvm.floor.f16(half %x)
   %sub = fsub half %x, %floor
@@ -2168,6 +2553,29 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
 ; GFX11-NEXT:    global_store_b32 v[1:2], v4, off
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_v2f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-NEXT:    v_fract_f16_e32 v6, v0
+; GFX12-NEXT:    v_floor_f16_e32 v5, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_fract_f16_e32 v4, v3
+; GFX12-NEXT:    v_cmp_class_f16_e64 s0, v3, 0x204
+; GFX12-NEXT:    v_floor_f16_e32 v7, v3
+; GFX12-NEXT:    v_cndmask_b32_e64 v3, v4, 0, s0
+; GFX12-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x204
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_pack_b32_f16 v4, v5, v7
+; GFX12-NEXT:    v_cndmask_b32_e64 v0, v6, 0, s0
+; GFX12-NEXT:    global_store_b32 v[1:2], v4, off
+; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call <2 x half> @llvm.floor.v2f16(<2 x half> %x)
   %sub = fsub <2 x half> %x, %floor
@@ -2311,6 +2719,26 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v13, 0, s1
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_v2f64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f64_e32 v[10:11], v[0:1]
+; GFX12-NEXT:    v_cmp_class_f64_e64 s0, v[0:1], 0x204
+; GFX12-NEXT:    v_fract_f64_e32 v[12:13], v[2:3]
+; GFX12-NEXT:    v_cmp_class_f64_e64 s1, v[2:3], 0x204
+; GFX12-NEXT:    v_floor_f64_e32 v[8:9], v[2:3]
+; GFX12-NEXT:    v_floor_f64_e32 v[6:7], v[0:1]
+; GFX12-NEXT:    v_cndmask_b32_e64 v0, v10, 0, s0
+; GFX12-NEXT:    v_cndmask_b32_e64 v1, v11, 0, s0
+; GFX12-NEXT:    v_cndmask_b32_e64 v2, v12, 0, s1
+; GFX12-NEXT:    v_cndmask_b32_e64 v3, v13, 0, s1
+; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call <2 x double> @llvm.floor.v2f64(<2 x double> %x)
   %sub = fsub <2 x double> %x, %floor
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
index b300181..c1d5b58 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
@@ -2380,14 +2380,12 @@ define float @v_sqrt_f32_ulp2_contractable_rcp(float %x) {
 ; GISEL-IEEE:       ; %bb.0:
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x4b800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 24, vcc
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x45800000
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 12, vcc
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_rcp:
@@ -2734,20 +2732,18 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_rcp(<2 x float> %x) {
 ; GISEL-IEEE:       ; %bb.0:
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x4b800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v4, 1.0, v3, vcc
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, 0, 24, vcc
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v2
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[4:5]
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v3
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 24, s[4:5]
 ; GISEL-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GISEL-IEEE-NEXT:    v_rsq_f32_e32 v1, v1
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0x45800000
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 1.0, v4, s[4:5]
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 12, vcc
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 12, s[4:5]
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_rcp:
diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
index cefd240..85c6577 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
+++ b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
@@ -18,7 +18,7 @@ body:             |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0
     ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec
-    ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, killed [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, killed [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
     %0 = IMPLICIT_DEF
     %1 = COPY %0.sub1
     %2 = COPY %0.sub0
@@ -43,7 +43,7 @@ body:             |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0
     ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec
-    ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, [[COPY1]], 0, killed [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, [[COPY1]], 0, killed [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
     %0 = IMPLICIT_DEF
     %1 = COPY %0.sub1
     %2 = COPY %0.sub0
@@ -68,7 +68,7 @@ body:             |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub1
     ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec
-    ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, killed [[COPY]], 0, [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, killed [[COPY]], 0, [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
     %0 = IMPLICIT_DEF
     %1 = COPY %0.sub0
     %2 = COPY %0.sub1
@@ -90,7 +90,7 @@ body:             |
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
     ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 49664, implicit $exec
-    ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, 16384, 0, killed [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, 16384, 0, killed [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
     ; GFX11-NEXT: S_ENDPGM 0
     %0:vgpr_32 = COPY killed $vgpr0
 
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 157f91c..b2f113f 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -668,37 +668,32 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFFFF(ptr addrspace(1)
 define amdgpu_ps float @global_load_saddr_i8_offset_0x100000000(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_0x100000000:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_add_i32 s3, s3, 1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: global_load_saddr_i8_offset_0x100000000:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
-; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_add_i32 s3, s3, 1
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_load_saddr_i8_offset_0x100000000:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_add_i32 s3, s3, 1
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000000:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_mov_b32 s0, 0
-; GFX12-SDAG-NEXT:    s_mov_b32 s1, 1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
-; GFX12-SDAG-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT:    s_add_co_i32 s3, s3, 1
+; GFX12-SDAG-NEXT:    s_load_u8 s0, s[2:3], 0x0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
@@ -934,37 +929,32 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0xFFFFFFFF(ptr addrspace(
 define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000000(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_neg0x100000000:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_add_i32 s3, s3, -1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: global_load_saddr_i8_offset_neg0x100000000:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
-; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_add_i32 s3, s3, -1
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_load_saddr_i8_offset_neg0x100000000:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_add_i32 s3, s3, -1
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0x100000000:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_mov_b32 s0, 0
-; GFX12-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
-; GFX12-SDAG-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT:    s_add_co_i32 s3, s3, -1
+; GFX12-SDAG-NEXT:    s_load_u8 s0, s[2:3], 0x0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
new file mode 100644
index 0000000..503f27e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
@@ -0,0 +1,204 @@
+# RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -simplify-mir -start-before=greedy,2 -stress-regalloc=4 -stop-before=virtregrewriter,2 -o - -verify-regalloc %s 2> %t.err | FileCheck %s
+# RUN: FileCheck -check-prefix=ERR %s < %t.err
+
+# To allocate the vreg_512_align2, the allocation will attempt to
+# inflate the register class to av_512_align2. This will ultimately
+# not work, and the allocation will fail. There is an unproductive
+# live range split, and we end up with a snippet copy of an
+# unspillable register. Recursive assignment of interfering ranges
+# during last chance recoloring would delete the unspillable snippet
+# live range. Make sure there's no use after free when rolling back
+# the last chance assignment.
+
+# ERR: error: <unknown>:0:0: ran out of registers during register allocation in function 'inflated_reg_class_copy_use_after_free'
+# ERR: error: <unknown>:0:0: ran out of registers during register allocation in function 'inflated_reg_class_copy_use_after_free_lane_subset'
+
+--- |
+  define amdgpu_kernel void @inflated_reg_class_copy_use_after_free() {
+    ret void
+  }
+
+  define amdgpu_kernel void @inflated_reg_class_copy_use_after_free_lane_subset() {
+    ret void
+  }
+
+...
+
+# CHECK-LABEL: name: inflated_reg_class_copy_use_after_free
+# CHECK: S_NOP 0, implicit-def [[ORIG_REG:%[0-9]+]].sub0_sub1_sub2_sub3
+# CHECK-NEXT: SI_SPILL_AV512_SAVE [[ORIG_REG]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: [[RESTORE0:%[0-9]+]]:vreg_512_align2 = SI_SPILL_V512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: early-clobber [[MFMA0:%[0-9]+]]:vreg_512_align2 = V_MFMA_F32_16X16X1F32_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, [[RESTORE0]], 0, 0, 0, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+# CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[MFMA0]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = COPY [[MFMA0]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: undef [[SPLIT1:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT1]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT1]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT1]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: SI_SPILL_AV512_SAVE [[SPLIT2]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.1, align 4, addrspace 5)
+# CHECK-NEXT: [[RESTORE1:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub0_sub1:av_512_align2 = COPY [[RESTORE1]].sub0_sub1
+# CHECK-NEXT: [[RESTORE2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.1, align 4, addrspace 5)
+# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[RESTORE2]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT3]].sub0:av_512_align2 = COPY [[RESTORE2]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: undef [[SPLIT4:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT3]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT4]].sub0:av_512_align2 = COPY [[SPLIT3]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: [[SPLIT5:%[0-9]+]].sub2:av_512_align2 = COPY [[SPLIT4]].sub3
+# CHECK-NEXT: undef [[SPLIT6:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT5]].sub0_sub1_sub2
+# CHECK-NEXT: undef [[SPLIT7:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT6]].sub0_sub1_sub2
+# CHECK-NEXT: undef [[SPLIT8:%[0-9]+]].sub0:av_512_align2 = COPY [[SPLIT4]].sub0 {
+# CHECK-NEXT: internal [[SPLIT8]].sub2:av_512_align2 = COPY [[SPLIT4]].sub2
+# CHECK-NEXT: }
+# CHECK-NEXT: [[SPLIT9:%[0-9]+]].sub3:av_512_align2 = COPY [[SPLIT8]].sub2
+# CHECK-NEXT: undef [[SPLIT10:%[0-9]+]].sub0_sub1_sub2_sub3:av_512_align2 = COPY [[SPLIT9]].sub0_sub1_sub2_sub3
+# CHECK-NEXT: undef [[SPLIT13:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_512_align2 = COPY [[SPLIT10]].sub0_sub1_sub2_sub3
+# CHECK-NEXT: [[MFMA_USE1:%[0-9]+]].sub4:vreg_512_align2 = COPY [[SPLIT8]].sub0
+# CHECK-NEXT: [[MFMA_USE1]].sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub10:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]]:vreg_512_align2 = V_MFMA_F32_16X16X1F32_mac_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, [[MFMA_USE1]], 0, 0, 0, implicit $mode, implicit $exec
+
+---
+name:            inflated_reg_class_copy_use_after_free
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr72_sgpr73_sgpr74_sgpr75'
+  stackPtrOffsetReg: '$sgpr32'
+  occupancy:       7
+  vgprForAGPRCopy: '$vgpr255'
+  sgprForEXECCopy: '$sgpr74_sgpr75'
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+
+    %0:vgpr_32 = IMPLICIT_DEF
+    renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed undef renamable $sgpr4_sgpr5, 0, 0 :: (load (s64), addrspace 4)
+    S_NOP 0, implicit-def undef %1.sub12_sub13_sub14_sub15:vreg_512_align2
+    S_NOP 0, implicit-def %1.sub8_sub9_sub10_sub11:vreg_512_align2
+    S_NOP 0, implicit-def %1.sub4_sub5_sub6_sub7:vreg_512_align2
+    S_NOP 0, implicit-def %1.sub0_sub1_sub2_sub3:vreg_512_align2
+    early-clobber %2:vreg_512_align2 = V_MFMA_F32_16X16X1F32_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, %1, 0, 0, 0, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+    %1.sub2:vreg_512_align2 = COPY %2.sub3
+    %1.sub3:vreg_512_align2 = COPY %2.sub2
+    %1.sub4:vreg_512_align2 = COPY %2.sub0
+    %1.sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub10:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1:vreg_512_align2 = V_MFMA_F32_16X16X1F32_mac_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, %1, 0, 0, 0, implicit $mode, implicit $exec
+    GLOBAL_STORE_DWORDX4_SADDR undef %3:vgpr_32, %1.sub12_sub13_sub14_sub15, undef renamable $sgpr0_sgpr1, 96, 0, implicit $exec :: (store (s128), addrspace 1)
+    S_ENDPGM 0
+
+...
+
+# This test is similar to except it is still broken when the use
+# instruction does not read the full set of lanes after one attempted fix.
+
+# CHECK-LABEL: name: inflated_reg_class_copy_use_after_free_lane_subset
+# CHECK: S_NOP 0, implicit-def [[ORIG_REG:%[0-9]+]].sub0_sub1_sub2_sub3
+# CHECK-NEXT: SI_SPILL_AV512_SAVE [[ORIG_REG]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: [[RESTORE_0:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: S_NOP 0, implicit-def early-clobber [[REG1:%[0-9]+]], implicit [[RESTORE_0]].sub0_sub1_sub2_sub3, implicit [[RESTORE_0]].sub4_sub5_sub6_sub7
+# CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[REG1]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = COPY [[REG1]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: undef [[SPLIT1:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT1]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT1]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT1]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: SI_SPILL_AV512_SAVE [[SPLIT2]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.1, align 4, addrspace 5)
+# CHECK-NEXT: [[RESTORE_1:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub0_sub1:av_512_align2 = COPY [[RESTORE_1]].sub0_sub1
+# CHECK-NEXT: [[RESTORE_2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.1, align 4, addrspace 5)
+# CHECK-NEXT: undef [[SPLIT4:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[RESTORE_2]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT4]].sub0:av_512_align2 = COPY [[RESTORE_2]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: undef [[SPLIT5:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT4]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT5]].sub0:av_512_align2 = COPY [[SPLIT4]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: [[SPLIT3]].sub2:av_512_align2 = COPY [[SPLIT5]].sub3
+# CHECK-NEXT: undef [[SPLIT6:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT3]].sub0_sub1_sub2
+# CHECK-NEXT: undef [[SPLIT7:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT6]].sub0_sub1_sub2
+# CHECK-NEXT: undef [[SPLIT8:%[0-9]+]].sub0:av_512_align2 = COPY [[SPLIT5]].sub0 {
+# CHECK-NEXT: internal [[SPLIT8]].sub2:av_512_align2 = COPY [[SPLIT5]].sub2
+# CHECK-NEXT: }
+# CHECK-NEXT: [[SPLIT7]].sub3:av_512_align2 = COPY [[SPLIT8]].sub2
+# CHECK-NEXT: undef [[SPLIT9:%[0-9]+]].sub0_sub1_sub2_sub3:av_512_align2 = COPY [[SPLIT7]].sub0_sub1_sub2_sub3
+# CHECK-NEXT: undef [[LAST_USE:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_512_align2 = COPY [[SPLIT9]].sub0_sub1_sub2_sub3
+# CHECK-NEXT: [[LAST_USE]].sub4:vreg_512_align2 = COPY [[SPLIT8]].sub0
+# CHECK-NEXT: [[LAST_USE]].sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub10:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: S_NOP 0, implicit-def [[LAST_USE]], implicit [[LAST_USE]].sub0_sub1_sub2_sub3, implicit [[LAST_USE]].sub4_sub5_sub6_sub7, implicit [[LAST_USE]].sub8_sub9_sub10_sub11
+
+---
+name:            inflated_reg_class_copy_use_after_free_lane_subset
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr72_sgpr73_sgpr74_sgpr75'
+  stackPtrOffsetReg: '$sgpr32'
+  occupancy:       7
+  vgprForAGPRCopy: '$vgpr255'
+  sgprForEXECCopy: '$sgpr74_sgpr75'
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+
+    %0:vgpr_32 = IMPLICIT_DEF
+    renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed undef renamable $sgpr4_sgpr5, 0, 0 :: (load (s64), addrspace 4)
+    S_NOP 0, implicit-def undef %1.sub12_sub13_sub14_sub15:vreg_512_align2
+    S_NOP 0, implicit-def %1.sub8_sub9_sub10_sub11:vreg_512_align2
+    S_NOP 0, implicit-def %1.sub4_sub5_sub6_sub7:vreg_512_align2
+    S_NOP 0, implicit-def %1.sub0_sub1_sub2_sub3:vreg_512_align2
+    S_NOP 0, implicit-def early-clobber %2:vreg_512_align2, implicit %1.sub0_sub1_sub2_sub3, implicit %1.sub4_sub5_sub6_sub7
+    %1.sub2:vreg_512_align2 = COPY %2.sub3
+    %1.sub3:vreg_512_align2 = COPY %2.sub2
+    %1.sub4:vreg_512_align2 = COPY %2.sub0
+    %1.sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub10:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    S_NOP 0, implicit-def %1:vreg_512_align2, implicit %1.sub0_sub1_sub2_sub3, implicit %1.sub4_sub5_sub6_sub7, implicit %1.sub8_sub9_sub10_sub11
+    GLOBAL_STORE_DWORDX4_SADDR undef %3:vgpr_32, %1.sub12_sub13_sub14_sub15, undef renamable $sgpr0_sgpr1, 96, 0, implicit $exec :: (store (s128), addrspace 1)
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
index cf9fdbd..2ceaca3 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -8,16 +8,16 @@
 define amdgpu_kernel void @s_input_output_i128() {
   ; GFX908-LABEL: name: s_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %12
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %12
   ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %12
-  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]]
+  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7471113 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A-LABEL: name: s_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %10
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %10
   ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %10
-  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]]
+  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7471113 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=s"()
   call void asm sideeffect "; use $0", "s"(i128 %val)
diff --git a/llvm/test/CodeGen/AMDGPU/issue121601-combine-concat-vectors-assumes-f16.ll b/llvm/test/CodeGen/AMDGPU/issue121601-combine-concat-vectors-assumes-f16.ll
new file mode 100644
index 0000000..1a87887
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/issue121601-combine-concat-vectors-assumes-f16.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
+
+define <4 x float> @issue121601(bfloat %fptrunc) {
+; CHECK-LABEL: issue121601:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; CHECK-NEXT:    v_mov_b32_e32 v1, v0
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+bb:
+  %bitcast = bitcast bfloat %fptrunc to <1 x bfloat>
+  %shufflevector = shufflevector <1 x bfloat> %bitcast, <1 x bfloat> zeroinitializer, <2 x i32> zeroinitializer
+  %fpext = fpext <2 x bfloat> %shufflevector to <2 x float>
+  %shufflevector1 = shufflevector <2 x float> %fpext, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %shufflevector1
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 3ff759a..867025a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -4,6 +4,7 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX12 %s
 
 define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX6-LABEL: cos_f16:
@@ -80,6 +81,19 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX11-NEXT:    v_cos_f16_e32 v1, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: cos_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_cos_f16_e32 v1, v1
+; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
   %a.val = load half, ptr addrspace(1) %a
   %r.val = call half @llvm.cos.f16(half %a.val)
   store half %r.val, ptr addrspace(1) %r
@@ -188,6 +202,24 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: cos_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX12-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX12-NEXT:    v_cos_f16_e32 v1, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-NEXT:    v_cos_f16_e32 v2, v2
+; GFX12-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
   %a.val = load <2 x half>, ptr addrspace(1) %a
   %r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val)
   store <2 x half> %r.val, ptr addrspace(1) %r
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index ac51580..333d428 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -41,10 +41,10 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, s2, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    s_mov_b32 s2, -1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -78,9 +78,9 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, s2, v0
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, v0, v1
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
@@ -115,9 +115,9 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) {
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, s0, v0
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
@@ -203,7 +203,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x1f800000
+; SI-GISEL-NEXT:    v_not_b32_e32 v2, 63
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
@@ -213,10 +213,10 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, s7, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v3, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v2, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v3, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[0:1]
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v3, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -252,7 +252,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x1f800000
+; VI-GISEL-NEXT:    v_not_b32_e32 v2, 63
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
@@ -262,10 +262,10 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, s7, v0
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v3, v3
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v1, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v2, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v3, v0
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[0:1]
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v3, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, v1, v2
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -300,7 +300,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x1f800000
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v2, 63
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v0
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v0
@@ -310,10 +310,10 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, s11, v0
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v3, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v2, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v3, v0
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[0:1]
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v3, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v1, v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX900-GISEL-NEXT:    s_endpgm
@@ -421,17 +421,17 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x1f800000
+; SI-GISEL-NEXT:    v_not_b32_e32 v3, 63
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v1
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, s0, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v3, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s1, v1
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v4
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v4
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_add_f32_e32 v4, s1, v4
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s2, v1
@@ -439,11 +439,11 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v4, v4
 ; SI-GISEL-NEXT:    v_add_f32_e32 v1, s2, v1
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v2, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v3, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, v4, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, v3, s[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, v4, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[0:1]
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-GISEL-NEXT:    buffer_store_dword v2, off, s[4:7], 0 offset:8
 ; SI-GISEL-NEXT:    s_endpgm
@@ -487,16 +487,16 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x1f800000
+; VI-GISEL-NEXT:    v_not_b32_e32 v3, 63
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v1
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, s0, v0
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v3, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s1, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v4
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v4
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
 ; VI-GISEL-NEXT:    v_add_f32_e32 v4, s1, v4
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s2, v1
@@ -504,10 +504,10 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; VI-GISEL-NEXT:    v_add_f32_e32 v1, s2, v1
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v4, v4
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v2, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v3, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, v3, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, v4, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[0:1]
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, v4, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s5
 ; VI-GISEL-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
@@ -551,15 +551,15 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x1f800000
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v3, 63
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v1
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, s0, v0
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v3, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s1, v1
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v4
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v4, s1, v4
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s2, v1
@@ -567,10 +567,10 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, s2, v1
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v4, v4
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v2, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v3, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, v3, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, v4, v1
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[0:1]
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v4, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, v2, v3
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX900-GISEL-NEXT:    s_endpgm
@@ -710,7 +710,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x1f800000
+; SI-GISEL-NEXT:    v_not_b32_e32 v4, 63
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
@@ -720,22 +720,22 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_add_f32_e32 v1, s9, v1
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 1.0, v4, s[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v5
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v5
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v3, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v3, s[0:1]
 ; SI-GISEL-NEXT:    v_add_f32_e32 v5, s10, v5
 ; SI-GISEL-NEXT:    v_add_f32_e32 v2, s11, v2
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v5, v5
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v3, v2
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 1.0, v4, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, v5, v2
-; SI-GISEL-NEXT:    v_mul_f32_e32 v3, v3, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v2, v5, v2
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v3, v3, v4
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@@ -787,7 +787,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x1f800000
+; VI-GISEL-NEXT:    v_not_b32_e32 v4, 63
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
@@ -797,22 +797,22 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v1, s9, v1
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v5
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 1.0, v4, s[0:1]
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v5
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v5
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, v1, v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v3, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v3, s[0:1]
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, s10, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, s11, v2
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v5, v5
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v3, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 1.0, v4, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, v5, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, v3, v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
+; VI-GISEL-NEXT:    v_ldexp_f32 v2, v5, v2
+; VI-GISEL-NEXT:    v_ldexp_f32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s2
 ; VI-GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -863,7 +863,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x1f800000
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v4, 63
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
@@ -873,22 +873,22 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, s9, v1
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v5
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v5, 1.0, v4, s[0:1]
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v1, v5
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v3, s[0:1]
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v5, s10, v5
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, s11, v2
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v5, v5
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v3, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v4, 1.0, v4, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, v5, v2
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, v5, v2
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v3, v3, v4
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX900-GISEL-NEXT:    s_endpgm
@@ -1006,19 +1006,19 @@ define float @v_exp2_f32(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -1034,6 +1034,20 @@ define float @v_exp2_f32(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1048,6 +1062,20 @@ define float @v_exp2_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1076,19 +1104,19 @@ define float @v_exp2_fabs_f32(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_fabs_f32:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e64 v0, |v0|, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_fabs_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e64 v0, |v0|, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_fabs_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -1104,6 +1132,20 @@ define float @v_exp2_fabs_f32(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_fabs_f32:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e64 v0, |v0|, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_fabs_f32:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1118,6 +1160,20 @@ define float @v_exp2_fabs_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_fabs_f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e64 v0, |v0|, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_fabs_f32:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1147,19 +1203,19 @@ define float @v_exp2_fneg_fabs_f32(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_fneg_fabs_f32:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_sub_f32_e64 v0, v1, |v0|
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_fneg_fabs_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e64 v0, v1, |v0|
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_fneg_fabs_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -1175,6 +1231,20 @@ define float @v_exp2_fneg_fabs_f32(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_fneg_fabs_f32:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e64 v0, v1, |v0|
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_fneg_fabs_f32:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1189,6 +1259,20 @@ define float @v_exp2_fneg_fabs_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_fneg_fabs_f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e64 v0, v1, |v0|
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_fneg_fabs_f32:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1219,19 +1303,19 @@ define float @v_exp2_fneg_f32(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_fneg_f32:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_fneg_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_fneg_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -1247,6 +1331,20 @@ define float @v_exp2_fneg_f32(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_fneg_f32:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_fneg_f32:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1261,6 +1359,20 @@ define float @v_exp2_fneg_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_fneg_f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_fneg_f32:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1290,19 +1402,19 @@ define float @v_exp2_f32_fast(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_fast:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_fast:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_fast:
 ; VI-SDAG:       ; %bb.0:
@@ -1318,6 +1430,20 @@ define float @v_exp2_f32_fast(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_fast:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_fast:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1332,6 +1458,20 @@ define float @v_exp2_f32_fast(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_fast:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_fast:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1360,19 +1500,19 @@ define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_unsafe_math_attr:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_unsafe_math_attr:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_unsafe_math_attr:
 ; VI-SDAG:       ; %bb.0:
@@ -1388,6 +1528,20 @@ define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_unsafe_math_attr:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_unsafe_math_attr:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1402,6 +1556,20 @@ define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_unsafe_math_attr:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_unsafe_math_attr:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1430,19 +1598,19 @@ define float @v_exp2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_approx_fn_attr:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_approx_fn_attr:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_approx_fn_attr:
 ; VI-SDAG:       ; %bb.0:
@@ -1458,6 +1626,20 @@ define float @v_exp2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_approx_fn_attr:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_approx_fn_attr:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1472,6 +1654,20 @@ define float @v_exp2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_approx_fn_attr:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_approx_fn_attr:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1500,19 +1696,19 @@ define float @v_exp2_f32_ninf(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_ninf:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_ninf:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_ninf:
 ; VI-SDAG:       ; %bb.0:
@@ -1528,6 +1724,20 @@ define float @v_exp2_f32_ninf(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_ninf:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_ninf:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1542,6 +1752,20 @@ define float @v_exp2_f32_ninf(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_ninf:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_ninf:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1570,19 +1794,19 @@ define float @v_exp2_f32_afn(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_afn:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_afn:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_afn:
 ; VI-SDAG:       ; %bb.0:
@@ -1598,6 +1822,20 @@ define float @v_exp2_f32_afn(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_afn:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_afn:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1612,6 +1850,20 @@ define float @v_exp2_f32_afn(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_afn:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_afn:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1660,19 +1912,19 @@ define float @v_exp2_f32_afn_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_afn_dynamic:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_afn_dynamic:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_afn_dynamic:
 ; VI-SDAG:       ; %bb.0:
@@ -1688,6 +1940,20 @@ define float @v_exp2_f32_afn_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_afn_dynamic:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_afn_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1702,6 +1968,20 @@ define float @v_exp2_f32_afn_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_afn_dynamic:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_afn_dynamic:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1730,19 +2010,19 @@ define float @v_fabs_exp2_f32_afn(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_fabs_exp2_f32_afn:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e64 v0, |v0|, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_fabs_exp2_f32_afn:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e64 v0, |v0|, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_fabs_exp2_f32_afn:
 ; VI-SDAG:       ; %bb.0:
@@ -1758,6 +2038,20 @@ define float @v_fabs_exp2_f32_afn(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_fabs_exp2_f32_afn:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e64 v0, |v0|, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_fabs_exp2_f32_afn:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1772,6 +2066,20 @@ define float @v_fabs_exp2_f32_afn(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_fabs_exp2_f32_afn:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e64 v0, |v0|, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_fabs_exp2_f32_afn:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1821,19 +2129,19 @@ define float @v_exp2_f32_nnan(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_nnan:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_nnan:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_nnan:
 ; VI-SDAG:       ; %bb.0:
@@ -1849,6 +2157,20 @@ define float @v_exp2_f32_nnan(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_nnan:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_nnan:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1863,6 +2185,20 @@ define float @v_exp2_f32_nnan(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_nnan:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_nnan:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1911,19 +2247,19 @@ define float @v_exp2_f32_nnan_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_nnan_dynamic:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_nnan_dynamic:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_nnan_dynamic:
 ; VI-SDAG:       ; %bb.0:
@@ -1939,6 +2275,20 @@ define float @v_exp2_f32_nnan_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_nnan_dynamic:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_nnan_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1953,6 +2303,20 @@ define float @v_exp2_f32_nnan_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_nnan_dynamic:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_nnan_dynamic:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -2001,19 +2365,19 @@ define float @v_exp2_f32_ninf_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_ninf_dynamic:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_ninf_dynamic:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_ninf_dynamic:
 ; VI-SDAG:       ; %bb.0:
@@ -2029,6 +2393,20 @@ define float @v_exp2_f32_ninf_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_ninf_dynamic:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_ninf_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2043,6 +2421,20 @@ define float @v_exp2_f32_ninf_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_ninf_dynamic:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_ninf_dynamic:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -2071,19 +2463,19 @@ define float @v_exp2_f32_nnan_ninf(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_nnan_ninf:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_nnan_ninf:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_nnan_ninf:
 ; VI-SDAG:       ; %bb.0:
@@ -2099,6 +2491,20 @@ define float @v_exp2_f32_nnan_ninf(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_nnan_ninf:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_nnan_ninf:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2113,6 +2519,20 @@ define float @v_exp2_f32_nnan_ninf(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_nnan_ninf:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_nnan_ninf:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -2161,19 +2581,19 @@ define float @v_exp2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_nnan_ninf_dynamic:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_nnan_ninf_dynamic:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_nnan_ninf_dynamic:
 ; VI-SDAG:       ; %bb.0:
@@ -2189,6 +2609,20 @@ define float @v_exp2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_nnan_ninf_dynamic:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_nnan_ninf_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2203,6 +2637,20 @@ define float @v_exp2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_nnan_ninf_dynamic:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_nnan_ninf_dynamic:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -2251,19 +2699,19 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_dynamic_mode:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_dynamic_mode:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_dynamic_mode:
 ; VI-SDAG:       ; %bb.0:
@@ -2279,6 +2727,20 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_dynamic_mode:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_dynamic_mode:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2293,6 +2755,20 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_dynamic_mode:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_dynamic_mode:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -2313,20 +2789,50 @@ define float @v_exp2_f32_undef() {
 ; GCN-SDAG-NEXT:    v_exp_f32_e32 v0, 0x7fc00000
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_undef:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
-; GCN-GISEL-NEXT:    v_add_f32_e32 v1, s4, v1
-; GCN-GISEL-NEXT:    v_add_f32_e64 v2, s4, 0
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_undef:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, s4, v1
+; SI-GISEL-NEXT:    v_add_f32_e64 v2, s4, 0
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_exp2_f32_undef:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
+; VI-GISEL-NEXT:    v_add_f32_e32 v1, s4, v1
+; VI-GISEL-NEXT:    v_add_f32_e64 v2, s4, 0
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_exp2_f32_undef:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, s4, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e64 v2, s4, 0
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_undef:
 ; R600:       ; %bb.0:
@@ -3359,19 +3865,19 @@ define float @v_exp2_f32_contract(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_contract:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_contract:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_contract:
 ; VI-SDAG:       ; %bb.0:
@@ -3387,6 +3893,20 @@ define float @v_exp2_f32_contract(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_contract:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_contract:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3401,6 +3921,20 @@ define float @v_exp2_f32_contract(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_contract:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_contract:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -3449,19 +3983,19 @@ define float @v_exp2_f32_contract_nnan_ninf(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_contract_nnan_ninf:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_contract_nnan_ninf:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_contract_nnan_ninf:
 ; VI-SDAG:       ; %bb.0:
@@ -3477,6 +4011,20 @@ define float @v_exp2_f32_contract_nnan_ninf(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_contract_nnan_ninf:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_contract_nnan_ninf:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3491,6 +4039,20 @@ define float @v_exp2_f32_contract_nnan_ninf(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_contract_nnan_ninf:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_contract_nnan_ninf:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -3518,3 +4080,5 @@ declare <3 x half> @llvm.exp2.v3f16(<3 x half>) #2
 attributes #0 = { "denormal-fp-math-f32"="ieee,preserve-sign" }
 attributes #1 = { "denormal-fp-math-f32"="dynamic,dynamic" }
 attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN-GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index b9fef083..88ef7a93 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -3,11 +3,13 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-SDAG %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s
 
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-GISEL %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-GISEL %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s
 
 define { half, i32 } @test_frexp_f16_i32(half %a) {
 ; GFX6-SDAG-LABEL: test_frexp_f16_i32:
@@ -50,6 +52,19 @@ define { half, i32 } @test_frexp_f16_i32(half %a) {
 ; GFX11-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f16_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
+; GFX12-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i32:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -96,6 +111,16 @@ define half @test_frexp_f16_i32_only_use_fract(half %a) {
 ; GFX11-NEXT:    v_frexp_mant_f16_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f16_i32_only_use_fract:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i32_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -145,6 +170,18 @@ define i32 @test_frexp_f16_i32_only_use_exp(half %a) {
 ; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f16_i32_only_use_exp:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i32_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -221,6 +258,25 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX11-NEXT:    v_bfe_i32 v2, v4, 0, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_v2f16_v2i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-NEXT:    v_frexp_mant_f16_e32 v2, v0
+; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_frexp_mant_f16_e32 v3, v1
+; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v4, v1
+; GFX12-NEXT:    v_bfe_i32 v1, v0, 0, 16
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_pack_b32_f16 v0, v2, v3
+; GFX12-NEXT:    v_bfe_i32 v2, v4, 0, 16
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -311,6 +367,20 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_frexp_mant_f16_e32 v1, v1
+; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -386,6 +456,22 @@ define <2 x i32> @test_frexp_v2f16_v2i32_only_use_exp(<2 x half> %a) {
 ; GFX11-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v1, v1
+; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -463,6 +549,19 @@ define { half, i16 } @test_frexp_f16_i16(half %a) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f16_i16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f16_e32 v2, v0
+; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i16:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -509,6 +608,16 @@ define half @test_frexp_f16_i16_only_use_fract(half %a) {
 ; GFX11-NEXT:    v_frexp_mant_f16_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f16_i16_only_use_fract:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i16_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -554,6 +663,16 @@ define i16 @test_frexp_f16_i16_only_use_exp(half %a) {
 ; GFX11-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f16_i16_only_use_exp:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i16_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -623,6 +742,19 @@ define { float, i32 } @test_frexp_f32_i32(float %a) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f32_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f32_e32 v2, v0
+; GFX12-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f32_i32:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -665,6 +797,16 @@ define float @test_frexp_f32_i32_only_use_fract(float %a) {
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f32_i32_only_use_fract:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f32_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f32_i32_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -706,6 +848,16 @@ define i32 @test_frexp_f32_i32_only_use_exp(float %a) {
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f32_i32_only_use_exp:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f32_i32_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -771,6 +923,21 @@ define { <2 x float>, <2 x i32> } @test_frexp_v2f32_v2i32(<2 x float> %a) {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_v2f32_v2i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f32_e32 v4, v0
+; GFX12-NEXT:    v_frexp_mant_f32_e32 v5, v1
+; GFX12-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
+; GFX12-NEXT:    v_frexp_exp_i32_f32_e32 v3, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -846,6 +1013,17 @@ define <2 x float> @test_frexp_v2f32_v2i32_only_use_fract(<2 x float> %a) {
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v1, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_v2f32_v2i32_only_use_fract:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f32_e32 v0, v0
+; GFX12-NEXT:    v_frexp_mant_f32_e32 v1, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -896,6 +1074,17 @@ define <2 x i32> @test_frexp_v2f32_v2i32_only_use_exp(<2 x float> %a) {
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_v2f32_v2i32_only_use_exp:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; GFX12-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -954,6 +1143,19 @@ define { double, i32 } @test_frexp_f64_i32(double %a) {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f64_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f64_e32 v[3:4], v[0:1]
+; GFX12-NEXT:    v_frexp_exp_i32_f64_e32 v2, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f64_i32:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1000,6 +1202,16 @@ define double @test_frexp_f64_i32_only_use_fract(double %a) {
 ; GFX11-NEXT:    v_frexp_mant_f64_e32 v[0:1], v[0:1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f64_i32_only_use_fract:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f64_e32 v[0:1], v[0:1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f64_i32_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1044,6 +1256,16 @@ define i32 @test_frexp_f64_i32_only_use_exp(double %a) {
 ; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f64_i32_only_use_exp:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f64_i32_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1116,6 +1338,22 @@ define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) {
 ; GFX11-NEXT:    v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v3, v7
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_v2f64_v2i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f64_e32 v[8:9], v[0:1]
+; GFX12-NEXT:    v_frexp_mant_f64_e32 v[6:7], v[2:3]
+; GFX12-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
+; GFX12-NEXT:    v_frexp_exp_i32_f64_e32 v5, v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
+; GFX12-NEXT:    v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v3, v7
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_v2f64_v2i32:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1174,6 +1412,17 @@ define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) {
 ; GFX11-NEXT:    v_frexp_mant_f64_e32 v[0:1], v[0:1]
 ; GFX11-NEXT:    v_frexp_mant_f64_e32 v[2:3], v[2:3]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_frexp_v2f64_v2i32_only_use_fract:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f64_e32 v[0:1], v[0:1]
+; GFX12-NEXT:    v_frexp_mant_f64_e32 v[2:3], v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a)
   %result.0 = extractvalue { <2 x double>, <2 x i32> } %result, 0
   ret <2 x double> %result.0
@@ -1213,6 +1462,17 @@ define <2 x i32> @test_frexp_v2f64_v2i32_only_use_exp(<2 x double> %a) {
 ; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
 ; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v1, v[2:3]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_frexp_v2f64_v2i32_only_use_exp:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
+; GFX12-NEXT:    v_frexp_exp_i32_f64_e32 v1, v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a)
   %result.1 = extractvalue { <2 x double>, <2 x i32> } %result, 1
   ret <2 x i32> %result.1
@@ -1235,3 +1495,5 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo
 ; GCN: {{.*}}
 ; GFX11-GISEL: {{.*}}
 ; GFX11-SDAG: {{.*}}
+; GFX12-GISEL: {{.*}}
+; GFX12-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 218e41f..b850428 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -45,16 +45,17 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
 ; SI-GISEL-NEXT:    s_load_dword s0, s[4:5], 0xb
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v0
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
@@ -64,7 +65,6 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -104,25 +104,25 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
 ; VI-GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
-; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_and_b32_e32 v2, 0xfffff000, v0
+; VI-GISEL-NEXT:    v_sub_f32_e32 v3, v0, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3805fdf4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317000, v3
+; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; VI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v1
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
@@ -162,25 +162,25 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
 ; GFX900-GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v3
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v2, -v5
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v5, v2
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x41b17218
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX900-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -218,24 +218,26 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s2, 0x800000, s0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
 ; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
-; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x41b17218, s2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
-; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, s2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log_f32:
@@ -358,35 +360,36 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3f317217
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3377d1cf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v2
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, s6, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v2
-; SI-GISEL-NEXT:    v_fma_f32 v7, v2, v3, -v6
-; SI-GISEL-NEXT:    v_fma_f32 v7, v2, v4, v7
-; SI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; SI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317217, v1
+; SI-GISEL-NEXT:    v_fma_f32 v6, v1, v2, -v5
+; SI-GISEL-NEXT:    v_fma_f32 v6, v1, v3, v6
+; SI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v1|, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
-; SI-GISEL-NEXT:    v_log_f32_e32 v1, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s7, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v5, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x41b17218
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
-; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v1
-; SI-GISEL-NEXT:    v_fma_f32 v3, v1, v3, -v2
-; SI-GISEL-NEXT:    v_fma_f32 v3, v1, v4, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v5
+; SI-GISEL-NEXT:    v_fma_f32 v2, v5, v2, -v1
+; SI-GISEL-NEXT:    v_fma_f32 v2, v5, v3, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v5|, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[0:1]
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
@@ -445,42 +448,43 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v2
-; VI-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; VI-GISEL-NEXT:    v_and_b32_e32 v4, 0xfffff000, v2
-; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v2, v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, s6, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v1, v1
+; VI-GISEL-NEXT:    v_and_b32_e32 v3, 0xfffff000, v1
+; VI-GISEL-NEXT:    v_sub_f32_e32 v4, v1, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3805fdf4, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3805fdf4, v4
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3805fdf4, v5
-; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317000, v5
-; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317000, v4
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317000, v3
 ; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v3
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v1|, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
-; VI-GISEL-NEXT:    v_log_f32_e32 v1, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s7, v0
+; VI-GISEL-NEXT:    v_log_f32_e32 v3, v0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x41b17218
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
-; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
-; VI-GISEL-NEXT:    v_and_b32_e32 v2, 0xfffff000, v1
-; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v1, v2
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v3
+; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v3, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3805fdf4, v5
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3805fdf4, v1
 ; VI-GISEL-NEXT:    v_add_f32_e32 v6, v7, v6
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317000, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
-; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v5
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
+; VI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v5
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[0:1]
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -531,37 +535,38 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3377d1cf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s10, v2
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v7, v2, v3, -v6
-; GFX900-GISEL-NEXT:    v_fma_f32 v7, v2, v4, v7
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, s10, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v1, v3, -v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v1, v4, v7
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v1|, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s11, v0
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x41b17218
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v1, v3, -v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v1, v4, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v5
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[0:1]
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s11, v0
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v6, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x41b17218
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v7, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v6, v3, -v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v6, v4, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v6|, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v7, s[0:1]
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
 ; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -608,31 +613,37 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s5, 0x800000, s3
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s5
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v1, s3, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v2, 0x3f317217, v0 :: v_dual_mul_f32 v3, 0x3f317217, v1
+; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v3, 0x3f317217, v1 :: v_dual_lshlrev_b32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s2, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_fma_f32 v5, 0x3f317217, v1, -v3
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v5, 0x3377d1cf, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT:    v_dual_add_f32 v3, v3, v5 :: v_dual_mul_f32 v2, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x41b17218, s5
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v4, 0x3f317217, v0, -v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v5, 0x3f317217, v1, -v3
-; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v4, 0x3377d1cf, v0 :: v_dual_fmac_f32 v5, 0x3377d1cf, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v4, 0x3377d1cf, v0
+; GFX1100-GISEL-NEXT:    v_add_f32_e32 v2, v2, v4
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x41b17218, s4
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x41b17218, s5
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-GISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, v1, v3
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_mov_b32 v2, 0
 ; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
 ; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1100-GISEL-NEXT:    s_endpgm
@@ -808,49 +819,51 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s8, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v3, -v6
-; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v4, v7
-; SI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v6, v0, v2, -v5
+; SI-GISEL-NEXT:    v_fma_f32 v6, v0, v3, v6
+; SI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 1.0, v2, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v6, s9, v6
-; SI-GISEL-NEXT:    v_log_f32_e32 v6, v6
-; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x41b17218
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 5, v5
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v5, s9, v5
+; SI-GISEL-NEXT:    v_log_f32_e32 v5, v5
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x41b17218
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
-; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v6
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s10, v1
-; SI-GISEL-NEXT:    v_fma_f32 v9, v6, v3, -v8
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v1
-; SI-GISEL-NEXT:    v_fma_f32 v9, v6, v4, v9
-; SI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v6|, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v6, v8, s[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v7, s[0:1]
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v6
-; SI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v2
-; SI-GISEL-NEXT:    v_fma_f32 v3, v2, v3, -v6
-; SI-GISEL-NEXT:    v_fma_f32 v3, v2, v4, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v3, v6, v3
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v7, vcc
-; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v7
+; SI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3f317217, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_fma_f32 v8, v5, v2, -v7
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_fma_f32 v8, v5, v3, v8
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, s10, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
+; SI-GISEL-NEXT:    v_log_f32_e32 v8, v1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v5|, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v5, v7, s[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v6, s[0:1]
+; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v5
+; SI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317217, v8
+; SI-GISEL-NEXT:    v_fma_f32 v2, v8, v2, -v5
+; SI-GISEL-NEXT:    v_fma_f32 v2, v8, v3, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, v5, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v8|, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-GISEL-NEXT:    buffer_store_dword v2, off, s[4:7], 0 offset:8
@@ -927,12 +940,13 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v3, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v4, v0, v3
@@ -943,45 +957,46 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317000, v3
 ; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, v2, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, s9, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 5, v3
+; VI-GISEL-NEXT:    v_ldexp_f32 v3, s9, v3
 ; VI-GISEL-NEXT:    v_log_f32_e32 v3, v3
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x41b17218
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
-; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v6
-; VI-GISEL-NEXT:    v_and_b32_e32 v6, 0xfffff000, v3
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x41b17218
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v5
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xfffff000, v3
+; VI-GISEL-NEXT:    v_sub_f32_e32 v6, v3, v5
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v1
-; VI-GISEL-NEXT:    v_sub_f32_e32 v7, v3, v6
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3805fdf4, v7
-; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x3805fdf4, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s10, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v8, v9, v8
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3f317000, v7
-; VI-GISEL-NEXT:    v_log_f32_e32 v2, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
+; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3805fdf4, v6
+; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3805fdf4, v5
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v7, v8, v7
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317000, v6
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
 ; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v3|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, v6, s[2:3]
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v5, s[0:1]
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317000, v5
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, s10, v1
+; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT:    v_log_f32_e32 v6, v1
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v3|, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, v5, s[2:3]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v4, s[0:1]
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; VI-GISEL-NEXT:    v_and_b32_e32 v3, 0xfffff000, v2
-; VI-GISEL-NEXT:    v_sub_f32_e32 v6, v2, v3
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3805fdf4, v6
+; VI-GISEL-NEXT:    v_and_b32_e32 v3, 0xfffff000, v6
+; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v6, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3805fdf4, v5
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3805fdf4, v3
 ; VI-GISEL-NEXT:    v_add_f32_e32 v7, v8, v7
-; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317000, v6
-; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317000, v5
+; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v7
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317000, v3
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v6
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v5
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v6|, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s[0:1]
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s5
@@ -1046,49 +1061,51 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3377d1cf
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3377d1cf
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v3, -v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v2, -v6
 ; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v4, v7
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v5
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v6, 1.0, v2, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, s9, v6
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v6
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v6, s9, v6
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v6, v6
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x41b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v6
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s10, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v9, v6, v3, -v8
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_fma_f32 v9, v6, v2, -v8
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
 ; GFX900-GISEL-NEXT:    v_fma_f32 v9, v6, v4, v9
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, s10, v1
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v9, v1
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v6|, v5
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v6, v8, s[2:3]
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v7, s[0:1]
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v6
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v2, v3, -v6
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v2, v4, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v3, v6, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v7, vcc
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v9
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, v9, v2, -v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, v9, v4, v2
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v6, v2
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v9|, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v9, v2, s[0:1]
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v7, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v4
 ; GFX900-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -1156,49 +1173,55 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s7, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s3, 0x800000, s0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s6, 0x800000, s1
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s7, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s6
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 0x41b17218, s6
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s7
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s3
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s6
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 0x41b17218, s3
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 0x41b17218, s6
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v2, s2, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v5, 0x3f317217, v2 :: v_dual_lshlrev_b32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_fma_f32 v8, 0x3f317217, v2, -v5
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v2, s2, v2
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v8, 0x3377d1cf, v2
+; GFX1100-GISEL-NEXT:    v_add_f32_e32 v5, v5, v8
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317217, v0
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v1
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v6, 0x3f317217, v0, -v3
+; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v6, 0x3377d1cf, v0 :: v_dual_lshlrev_b32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v1, s1, v1
+; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1100-GISEL-NEXT:    v_add_f32_e32 v3, v3, v6
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 0x41b17218, s7
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317217, v2
+; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v1
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v7, 0x3f317217, v1, -v4
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v6, 0x3377d1cf, v0
-; GFX1100-GISEL-NEXT:    v_fma_f32 v8, 0x3f317217, v2, -v5
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v7, 0x3377d1cf, v1
-; GFX1100-GISEL-NEXT:    v_add_f32_e32 v3, v3, v6
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 0x41b17218, s7
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v4, v4, v7
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v8, 0x3377d1cf, v2 :: v_dual_mov_b32 v3, 0
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
-; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v10
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_dual_add_f32 v5, v5, v8 :: v_dual_sub_f32 v0, v0, v9
+; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v9 :: v_dual_sub_f32 v1, v1, v10
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v6
@@ -1433,62 +1456,65 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3377d1cf
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s8, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v5, v7
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v7
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v6
+; SI-GISEL-NEXT:    v_fma_f32 v6, v0, v3, -v1
+; SI-GISEL-NEXT:    v_fma_f32 v6, v0, v4, v6
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v6
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v5
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v2
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s9, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, s9, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x41b17218
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
-; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
-; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v1
-; SI-GISEL-NEXT:    v_fma_f32 v9, v1, v4, -v8
-; SI-GISEL-NEXT:    v_fma_f32 v9, v1, v5, v9
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x41b17218
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v7
+; SI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3f317217, v1
+; SI-GISEL-NEXT:    v_fma_f32 v8, v1, v3, -v7
+; SI-GISEL-NEXT:    v_fma_f32 v8, v1, v4, v8
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v9, 1.0, v3, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v9, s10, v9
-; SI-GISEL-NEXT:    v_log_f32_e32 v9, v9
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v6
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v7, s[0:1]
+; SI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 5, v8
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v8, s10, v8
+; SI-GISEL-NEXT:    v_log_f32_e32 v8, v8
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, v6, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v8
-; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v9
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
-; SI-GISEL-NEXT:    v_fma_f32 v10, v9, v4, -v8
-; SI-GISEL-NEXT:    v_log_f32_e32 v3, v2
-; SI-GISEL-NEXT:    v_fma_f32 v10, v9, v5, v10
-; SI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v10
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v9|, v6
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v9, v8, s[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
-; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v8
-; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v3
-; SI-GISEL-NEXT:    v_fma_f32 v4, v3, v4, -v8
-; SI-GISEL-NEXT:    v_fma_f32 v4, v3, v5, v4
-; SI-GISEL-NEXT:    v_add_f32_e32 v4, v8, v4
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v6
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[0:1]
+; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v7
+; SI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3f317217, v8
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_fma_f32 v9, v8, v3, -v7
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; SI-GISEL-NEXT:    v_fma_f32 v9, v8, v4, v9
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v2, s11, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v9
+; SI-GISEL-NEXT:    v_log_f32_e32 v9, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v8|, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v8, v7, s[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v7
+; SI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3f317217, v9
+; SI-GISEL-NEXT:    v_fma_f32 v3, v9, v3, -v7
+; SI-GISEL-NEXT:    v_fma_f32 v3, v9, v4, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v3, v7, v3
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v9|, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[0:1]
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
-; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -1581,12 +1607,13 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v4, v0, v1
@@ -1597,62 +1624,64 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
 ; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v3
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s9, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, s9, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x41b17218
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
-; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v6
-; VI-GISEL-NEXT:    v_and_b32_e32 v6, 0xfffff000, v1
-; VI-GISEL-NEXT:    v_sub_f32_e32 v7, v1, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3805fdf4, v7
-; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x3805fdf4, v6
-; VI-GISEL-NEXT:    v_add_f32_e32 v8, v9, v8
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3f317000, v7
-; VI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x41b17218
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v5
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xfffff000, v1
+; VI-GISEL-NEXT:    v_sub_f32_e32 v6, v1, v5
+; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3805fdf4, v6
+; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3805fdf4, v5
+; VI-GISEL-NEXT:    v_add_f32_e32 v7, v8, v7
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317000, v6
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 1.0, v3, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, s10, v7
-; VI-GISEL-NEXT:    v_log_f32_e32 v7, v7
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v5, s[0:1]
-; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v6
-; VI-GISEL-NEXT:    v_and_b32_e32 v6, 0xfffff000, v7
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317000, v5
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v6
+; VI-GISEL-NEXT:    v_ldexp_f32 v6, s10, v6
+; VI-GISEL-NEXT:    v_log_f32_e32 v6, v6
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
+; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v5
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xfffff000, v6
+; VI-GISEL-NEXT:    v_sub_f32_e32 v7, v6, v5
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; VI-GISEL-NEXT:    v_sub_f32_e32 v8, v7, v6
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x3805fdf4, v8
-; VI-GISEL-NEXT:    v_mul_f32_e32 v10, 0x3805fdf4, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
-; VI-GISEL-NEXT:    v_add_f32_e32 v9, v10, v9
-; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317000, v8
-; VI-GISEL-NEXT:    v_log_f32_e32 v3, v2
-; VI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
-; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317000, v6
-; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v8
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v7|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v7, v6, s[2:3]
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
-; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v6
-; VI-GISEL-NEXT:    v_and_b32_e32 v6, 0xfffff000, v3
-; VI-GISEL-NEXT:    v_sub_f32_e32 v7, v3, v6
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3805fdf4, v7
-; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x3805fdf4, v6
+; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x3805fdf4, v5
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; VI-GISEL-NEXT:    v_add_f32_e32 v8, v9, v8
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3f317000, v7
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317000, v5
+; VI-GISEL-NEXT:    v_ldexp_f32 v2, s11, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v7
+; VI-GISEL-NEXT:    v_log_f32_e32 v7, v2
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v6|, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v5, s[2:3]
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v5
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xfffff000, v7
+; VI-GISEL-NEXT:    v_sub_f32_e32 v6, v7, v5
+; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3805fdf4, v6
+; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x3805fdf4, v5
+; VI-GISEL-NEXT:    v_add_f32_e32 v8, v9, v8
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317000, v6
-; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v5, s[0:1]
+; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v8
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317000, v5
+; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v7|, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1730,61 +1759,64 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3377d1cf
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3377d1cf
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v4, -v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v3, -v1
 ; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v5, v7
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v7
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v6
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s9, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, s9, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x41b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v9, v1, v4, -v8
+; GFX900-GISEL-NEXT:    v_fma_f32 v9, v1, v3, -v8
 ; GFX900-GISEL-NEXT:    v_fma_f32 v9, v1, v5, v9
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v9, 1.0, v3, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v9, s10, v9
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 5, v9
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v9, s10, v9
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v9, v9
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v6
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[2:3]
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v7, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v8
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v9
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v10, v9, v4, -v8
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_fma_f32 v10, v9, v3, -v8
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
 ; GFX900-GISEL-NEXT:    v_fma_f32 v10, v9, v5, v10
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, s11, v2
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v8, v8, v10
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v10, v2
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v9|, v6
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v9, v8, s[2:3]
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v8
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v3, v4, -v8
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v3, v5, v4
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v4, v8, v4
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v6
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[0:1]
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v10
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v10, v3, -v8
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v10, v5, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v3, v8, v3
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v10|, v6
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v7, s[0:1]
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v5
 ; GFX900-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -1860,60 +1892,67 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s6, 0x800000, s0
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s7, 0x800000, s1
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s8, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s9, 0x800000, s3
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s6, 0x800000, s0
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s7, 0x800000, s1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s6
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s7
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s8
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s9
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s6
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s7
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x41b17218, s6
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v2, s2, v2 :: v_dual_mul_f32 v3, s3, v3
-; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(TRANS32_DEP_3)
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v3, v3
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 5, v3
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 0x41b17218, s7
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 0x41b17218, s8
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 0x41b17218, s9
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v2, s2, v2
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v3, s3, v3
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v3, v3
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3f317217, v2
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
+; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v3
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v1, s1, v1
+; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1100-GISEL-NEXT:    v_fma_f32 v12, 0x3f317217, v2, -v7
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT:    v_fma_f32 v13, 0x3f317217, v3, -v8
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v12, 0x3377d1cf, v2 :: v_dual_fmac_f32 v13, 0x3377d1cf, v3
+; GFX1100-GISEL-NEXT:    v_add_f32_e32 v7, v7, v12
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3
+; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_add_f32 v8, v8, v13
+; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v1
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v10, 0x3f317217, v0, -v5
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v11, 0x3f317217, v1, -v6
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v12, 0x3f317217, v2, -v7
-; GFX1100-GISEL-NEXT:    v_fma_f32 v13, 0x3f317217, v3, -v8
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v10, 0x3377d1cf, v0 :: v_dual_fmac_f32 v11, 0x3377d1cf, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v12, 0x3377d1cf, v2 :: v_dual_fmac_f32 v13, 0x3377d1cf, v3
 ; GFX1100-GISEL-NEXT:    v_dual_add_f32 v5, v5, v10 :: v_dual_add_f32 v6, v6, v11
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_dual_add_f32 v7, v7, v12 :: v_dual_add_f32 v8, v8, v13
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
-; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_sub_f32 v1, v1, v9
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_sub_f32 v0, v0, v4
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v3|
-; GFX1100-GISEL-NEXT:    v_dual_cndmask_b32 v3, v3, v8 :: v_dual_sub_f32 v2, v2, v14
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc_lo
+; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v1, v1, v9 :: v_dual_sub_f32 v2, v2, v14
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v15
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX1100-GISEL-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log_v4f32:
@@ -2126,10 +2165,10 @@ define float @v_log_f32(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -2175,16 +2214,16 @@ define float @v_log_f32(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -2224,10 +2263,10 @@ define float @v_log_f32(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -2270,21 +2309,22 @@ define float @v_log_f32(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2329,10 +2369,10 @@ define float @v_log_fabs_f32(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e64 v0, |v0|, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -2378,16 +2418,16 @@ define float @v_log_fabs_f32(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -2427,10 +2467,10 @@ define float @v_log_fabs_f32(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -2475,20 +2515,22 @@ define float @v_log_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, s0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2534,10 +2576,10 @@ define float @v_log_fneg_fabs_f32(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e64 v0, -|v0|, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -2583,16 +2625,16 @@ define float @v_log_fneg_fabs_f32(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, -|v0|, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -2632,10 +2674,10 @@ define float @v_log_fneg_fabs_f32(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, -|v0|, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -2680,20 +2722,22 @@ define float @v_log_fneg_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -|v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, -|v0|, v1
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, s0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2740,10 +2784,10 @@ define float @v_log_fneg_f32(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e64 v0, -v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -2789,16 +2833,16 @@ define float @v_log_fneg_f32(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, -v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -2838,10 +2882,10 @@ define float @v_log_fneg_f32(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, -v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -2885,20 +2929,22 @@ define float @v_log_fneg_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, -v0, v1
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, s0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3304,10 +3350,10 @@ define float @v_log_f32_ninf(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -3353,16 +3399,16 @@ define float @v_log_f32_ninf(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -3402,10 +3448,10 @@ define float @v_log_f32_ninf(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -3448,21 +3494,22 @@ define float @v_log_f32_ninf(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4038,10 +4085,10 @@ define float @v_log_f32_nnan(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -4087,16 +4134,16 @@ define float @v_log_f32_nnan(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -4136,10 +4183,10 @@ define float @v_log_f32_nnan(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -4182,21 +4229,22 @@ define float @v_log_f32_nnan(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4381,10 +4429,10 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -4430,16 +4478,16 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -4479,10 +4527,10 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -4525,21 +4573,22 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4724,10 +4773,10 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -4773,16 +4822,16 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -4822,10 +4871,10 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -4868,21 +4917,22 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4924,10 +4974,10 @@ define float @v_log_f32_nnan_ninf(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -4967,16 +5017,16 @@ define float @v_log_f32_nnan_ninf(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3805fdf4, v0
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v0
+; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317000, v0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -5010,10 +5060,10 @@ define float @v_log_f32_nnan_ninf(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -5051,18 +5101,20 @@ define float @v_log_f32_nnan_ninf(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5207,10 +5259,10 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -5250,16 +5302,16 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3805fdf4, v0
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v0
+; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317000, v0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -5293,10 +5345,10 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -5334,18 +5386,20 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5419,10 +5473,10 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -5468,16 +5522,16 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -5517,10 +5571,10 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -5563,21 +5617,22 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index fd50d1b..d09df75 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -45,16 +45,17 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
 ; SI-GISEL-NEXT:    s_load_dword s0, s[4:5], 0xb
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v0
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
@@ -64,7 +65,6 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -104,25 +104,25 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
 ; VI-GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
-; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_and_b32_e32 v2, 0xfffff000, v0
+; VI-GISEL-NEXT:    v_sub_f32_e32 v3, v0, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x369a84fb, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a2000, v3
+; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; VI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v1
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
@@ -162,25 +162,25 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
 ; GFX900-GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v3
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v2, -v5
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v5, v2
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x411a209b
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX900-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -218,24 +218,26 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s2, 0x800000, s0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
 ; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
-; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x411a209b, s2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
-; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, s2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log10_f32:
@@ -358,35 +360,36 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3e9a209a
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3284fbcf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v2
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, s6, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v2
-; SI-GISEL-NEXT:    v_fma_f32 v7, v2, v3, -v6
-; SI-GISEL-NEXT:    v_fma_f32 v7, v2, v4, v7
-; SI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; SI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a209a, v1
+; SI-GISEL-NEXT:    v_fma_f32 v6, v1, v2, -v5
+; SI-GISEL-NEXT:    v_fma_f32 v6, v1, v3, v6
+; SI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v1|, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
-; SI-GISEL-NEXT:    v_log_f32_e32 v1, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s7, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v5, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x411a209b
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
-; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v1
-; SI-GISEL-NEXT:    v_fma_f32 v3, v1, v3, -v2
-; SI-GISEL-NEXT:    v_fma_f32 v3, v1, v4, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v5
+; SI-GISEL-NEXT:    v_fma_f32 v2, v5, v2, -v1
+; SI-GISEL-NEXT:    v_fma_f32 v2, v5, v3, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v5|, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[0:1]
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
@@ -445,42 +448,43 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v2
-; VI-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; VI-GISEL-NEXT:    v_and_b32_e32 v4, 0xfffff000, v2
-; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v2, v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, s6, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v1, v1
+; VI-GISEL-NEXT:    v_and_b32_e32 v3, 0xfffff000, v1
+; VI-GISEL-NEXT:    v_sub_f32_e32 v4, v1, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x369a84fb, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x369a84fb, v4
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x369a84fb, v5
-; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a2000, v5
-; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a2000, v4
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a2000, v3
 ; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v3
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v1|, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
-; VI-GISEL-NEXT:    v_log_f32_e32 v1, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s7, v0
+; VI-GISEL-NEXT:    v_log_f32_e32 v3, v0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x411a209b
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
-; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
-; VI-GISEL-NEXT:    v_and_b32_e32 v2, 0xfffff000, v1
-; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v1, v2
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v3
+; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v3, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x369a84fb, v5
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x369a84fb, v1
 ; VI-GISEL-NEXT:    v_add_f32_e32 v6, v7, v6
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a2000, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
-; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v5
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
+; VI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v5
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[0:1]
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -531,37 +535,38 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3284fbcf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s10, v2
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v7, v2, v3, -v6
-; GFX900-GISEL-NEXT:    v_fma_f32 v7, v2, v4, v7
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, s10, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v1, v3, -v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v1, v4, v7
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v1|, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s11, v0
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x411a209b
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v1, v3, -v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v1, v4, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v5
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[0:1]
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s11, v0
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v6, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x411a209b
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v7, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v6, v3, -v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v6, v4, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v6|, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v7, s[0:1]
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
 ; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -608,31 +613,37 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s5, 0x800000, s3
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s5
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v1, s3, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v2, 0x3e9a209a, v0 :: v_dual_mul_f32 v3, 0x3e9a209a, v1
+; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v3, 0x3e9a209a, v1 :: v_dual_lshlrev_b32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s2, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_fma_f32 v5, 0x3e9a209a, v1, -v3
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v5, 0x3284fbcf, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT:    v_dual_add_f32 v3, v3, v5 :: v_dual_mul_f32 v2, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x411a209b, s5
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v4, 0x3e9a209a, v0, -v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v5, 0x3e9a209a, v1, -v3
-; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v4, 0x3284fbcf, v0 :: v_dual_fmac_f32 v5, 0x3284fbcf, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v4, 0x3284fbcf, v0
+; GFX1100-GISEL-NEXT:    v_add_f32_e32 v2, v2, v4
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x411a209b, s4
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x411a209b, s5
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-GISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, v1, v3
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_mov_b32 v2, 0
 ; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
 ; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1100-GISEL-NEXT:    s_endpgm
@@ -808,49 +819,51 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s8, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v3, -v6
-; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v4, v7
-; SI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v6, v0, v2, -v5
+; SI-GISEL-NEXT:    v_fma_f32 v6, v0, v3, v6
+; SI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 1.0, v2, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v6, s9, v6
-; SI-GISEL-NEXT:    v_log_f32_e32 v6, v6
-; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x411a209b
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 5, v5
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v5, s9, v5
+; SI-GISEL-NEXT:    v_log_f32_e32 v5, v5
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x411a209b
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
-; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v6
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s10, v1
-; SI-GISEL-NEXT:    v_fma_f32 v9, v6, v3, -v8
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v1
-; SI-GISEL-NEXT:    v_fma_f32 v9, v6, v4, v9
-; SI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v6|, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v6, v8, s[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v7, s[0:1]
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v6
-; SI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v2
-; SI-GISEL-NEXT:    v_fma_f32 v3, v2, v3, -v6
-; SI-GISEL-NEXT:    v_fma_f32 v3, v2, v4, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v3, v6, v3
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v7, vcc
-; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v7
+; SI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3e9a209a, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_fma_f32 v8, v5, v2, -v7
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_fma_f32 v8, v5, v3, v8
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, s10, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
+; SI-GISEL-NEXT:    v_log_f32_e32 v8, v1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v5|, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v5, v7, s[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v6, s[0:1]
+; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v5
+; SI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a209a, v8
+; SI-GISEL-NEXT:    v_fma_f32 v2, v8, v2, -v5
+; SI-GISEL-NEXT:    v_fma_f32 v2, v8, v3, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, v5, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v8|, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-GISEL-NEXT:    buffer_store_dword v2, off, s[4:7], 0 offset:8
@@ -927,12 +940,13 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v3, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v4, v0, v3
@@ -943,45 +957,46 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a2000, v3
 ; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, v2, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, s9, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 5, v3
+; VI-GISEL-NEXT:    v_ldexp_f32 v3, s9, v3
 ; VI-GISEL-NEXT:    v_log_f32_e32 v3, v3
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x411a209b
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
-; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v6
-; VI-GISEL-NEXT:    v_and_b32_e32 v6, 0xfffff000, v3
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x411a209b
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v5
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xfffff000, v3
+; VI-GISEL-NEXT:    v_sub_f32_e32 v6, v3, v5
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v1
-; VI-GISEL-NEXT:    v_sub_f32_e32 v7, v3, v6
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x369a84fb, v7
-; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x369a84fb, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s10, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v8, v9, v8
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3e9a2000, v7
-; VI-GISEL-NEXT:    v_log_f32_e32 v2, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
+; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x369a84fb, v6
+; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x369a84fb, v5
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v7, v8, v7
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a2000, v6
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
 ; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v3|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, v6, s[2:3]
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v5, s[0:1]
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a2000, v5
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, s10, v1
+; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT:    v_log_f32_e32 v6, v1
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v3|, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, v5, s[2:3]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v4, s[0:1]
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; VI-GISEL-NEXT:    v_and_b32_e32 v3, 0xfffff000, v2
-; VI-GISEL-NEXT:    v_sub_f32_e32 v6, v2, v3
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x369a84fb, v6
+; VI-GISEL-NEXT:    v_and_b32_e32 v3, 0xfffff000, v6
+; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v6, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x369a84fb, v5
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x369a84fb, v3
 ; VI-GISEL-NEXT:    v_add_f32_e32 v7, v8, v7
-; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a2000, v6
-; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a2000, v5
+; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v7
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a2000, v3
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v6
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v5
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v6|, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s[0:1]
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s5
@@ -1046,49 +1061,51 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3284fbcf
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3284fbcf
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v3, -v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v2, -v6
 ; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v4, v7
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v5
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v6, 1.0, v2, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, s9, v6
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v6
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v6, s9, v6
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v6, v6
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x411a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v6
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s10, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v9, v6, v3, -v8
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_fma_f32 v9, v6, v2, -v8
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
 ; GFX900-GISEL-NEXT:    v_fma_f32 v9, v6, v4, v9
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, s10, v1
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v9, v1
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v6|, v5
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v6, v8, s[2:3]
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v7, s[0:1]
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v6
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v2, v3, -v6
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v2, v4, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v3, v6, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v7, vcc
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v9
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, v9, v2, -v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, v9, v4, v2
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v6, v2
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v9|, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v9, v2, s[0:1]
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v7, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v4
 ; GFX900-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -1156,49 +1173,55 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s7, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s3, 0x800000, s0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s6, 0x800000, s1
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s7, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s6
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 0x411a209b, s6
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s7
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s3
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s6
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 0x411a209b, s3
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 0x411a209b, s6
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v2, s2, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v5, 0x3e9a209a, v2 :: v_dual_lshlrev_b32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_fma_f32 v8, 0x3e9a209a, v2, -v5
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v2, s2, v2
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v8, 0x3284fbcf, v2
+; GFX1100-GISEL-NEXT:    v_add_f32_e32 v5, v5, v8
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a209a, v0
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v1
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v6, 0x3e9a209a, v0, -v3
+; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v6, 0x3284fbcf, v0 :: v_dual_lshlrev_b32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v1, s1, v1
+; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1100-GISEL-NEXT:    v_add_f32_e32 v3, v3, v6
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 0x411a209b, s7
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a209a, v2
+; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v1
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v7, 0x3e9a209a, v1, -v4
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v6, 0x3284fbcf, v0
-; GFX1100-GISEL-NEXT:    v_fma_f32 v8, 0x3e9a209a, v2, -v5
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v7, 0x3284fbcf, v1
-; GFX1100-GISEL-NEXT:    v_add_f32_e32 v3, v3, v6
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 0x411a209b, s7
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v4, v4, v7
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v8, 0x3284fbcf, v2 :: v_dual_mov_b32 v3, 0
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
-; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v10
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_dual_add_f32 v5, v5, v8 :: v_dual_sub_f32 v0, v0, v9
+; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v9 :: v_dual_sub_f32 v1, v1, v10
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v6
@@ -1433,62 +1456,65 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3284fbcf
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s8, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v5, v7
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v7
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v6
+; SI-GISEL-NEXT:    v_fma_f32 v6, v0, v3, -v1
+; SI-GISEL-NEXT:    v_fma_f32 v6, v0, v4, v6
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v6
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v5
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v2
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s9, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, s9, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x411a209b
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
-; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
-; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v1
-; SI-GISEL-NEXT:    v_fma_f32 v9, v1, v4, -v8
-; SI-GISEL-NEXT:    v_fma_f32 v9, v1, v5, v9
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x411a209b
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v7
+; SI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3e9a209a, v1
+; SI-GISEL-NEXT:    v_fma_f32 v8, v1, v3, -v7
+; SI-GISEL-NEXT:    v_fma_f32 v8, v1, v4, v8
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v9, 1.0, v3, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v9, s10, v9
-; SI-GISEL-NEXT:    v_log_f32_e32 v9, v9
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v6
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v7, s[0:1]
+; SI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 5, v8
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v8, s10, v8
+; SI-GISEL-NEXT:    v_log_f32_e32 v8, v8
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, v6, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v8
-; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v9
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
-; SI-GISEL-NEXT:    v_fma_f32 v10, v9, v4, -v8
-; SI-GISEL-NEXT:    v_log_f32_e32 v3, v2
-; SI-GISEL-NEXT:    v_fma_f32 v10, v9, v5, v10
-; SI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v10
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v9|, v6
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v9, v8, s[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
-; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v8
-; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v3
-; SI-GISEL-NEXT:    v_fma_f32 v4, v3, v4, -v8
-; SI-GISEL-NEXT:    v_fma_f32 v4, v3, v5, v4
-; SI-GISEL-NEXT:    v_add_f32_e32 v4, v8, v4
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v6
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[0:1]
+; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v7
+; SI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3e9a209a, v8
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_fma_f32 v9, v8, v3, -v7
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; SI-GISEL-NEXT:    v_fma_f32 v9, v8, v4, v9
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v2, s11, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v9
+; SI-GISEL-NEXT:    v_log_f32_e32 v9, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v8|, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v8, v7, s[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v7
+; SI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3e9a209a, v9
+; SI-GISEL-NEXT:    v_fma_f32 v3, v9, v3, -v7
+; SI-GISEL-NEXT:    v_fma_f32 v3, v9, v4, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v3, v7, v3
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v9|, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[0:1]
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
-; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -1581,12 +1607,13 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v4, v0, v1
@@ -1597,62 +1624,64 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
 ; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v3
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s9, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, s9, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x411a209b
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
-; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v6
-; VI-GISEL-NEXT:    v_and_b32_e32 v6, 0xfffff000, v1
-; VI-GISEL-NEXT:    v_sub_f32_e32 v7, v1, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x369a84fb, v7
-; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x369a84fb, v6
-; VI-GISEL-NEXT:    v_add_f32_e32 v8, v9, v8
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3e9a2000, v7
-; VI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x411a209b
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v5
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xfffff000, v1
+; VI-GISEL-NEXT:    v_sub_f32_e32 v6, v1, v5
+; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x369a84fb, v6
+; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x369a84fb, v5
+; VI-GISEL-NEXT:    v_add_f32_e32 v7, v8, v7
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a2000, v6
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 1.0, v3, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, s10, v7
-; VI-GISEL-NEXT:    v_log_f32_e32 v7, v7
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v5, s[0:1]
-; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v6
-; VI-GISEL-NEXT:    v_and_b32_e32 v6, 0xfffff000, v7
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a2000, v5
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v6
+; VI-GISEL-NEXT:    v_ldexp_f32 v6, s10, v6
+; VI-GISEL-NEXT:    v_log_f32_e32 v6, v6
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
+; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v5
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xfffff000, v6
+; VI-GISEL-NEXT:    v_sub_f32_e32 v7, v6, v5
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; VI-GISEL-NEXT:    v_sub_f32_e32 v8, v7, v6
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x369a84fb, v8
-; VI-GISEL-NEXT:    v_mul_f32_e32 v10, 0x369a84fb, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
-; VI-GISEL-NEXT:    v_add_f32_e32 v9, v10, v9
-; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a2000, v8
-; VI-GISEL-NEXT:    v_log_f32_e32 v3, v2
-; VI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
-; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a2000, v6
-; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v8
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v7|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v7, v6, s[2:3]
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
-; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v6
-; VI-GISEL-NEXT:    v_and_b32_e32 v6, 0xfffff000, v3
-; VI-GISEL-NEXT:    v_sub_f32_e32 v7, v3, v6
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x369a84fb, v7
-; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x369a84fb, v6
+; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x369a84fb, v5
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; VI-GISEL-NEXT:    v_add_f32_e32 v8, v9, v8
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3e9a2000, v7
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a2000, v5
+; VI-GISEL-NEXT:    v_ldexp_f32 v2, s11, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v7
+; VI-GISEL-NEXT:    v_log_f32_e32 v7, v2
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v6|, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v5, s[2:3]
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v5
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xfffff000, v7
+; VI-GISEL-NEXT:    v_sub_f32_e32 v6, v7, v5
+; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x369a84fb, v6
+; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x369a84fb, v5
+; VI-GISEL-NEXT:    v_add_f32_e32 v8, v9, v8
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a2000, v6
-; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v5, s[0:1]
+; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v8
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a2000, v5
+; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v7|, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1730,61 +1759,64 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3284fbcf
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3284fbcf
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v4, -v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v3, -v1
 ; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v5, v7
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v7
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v6
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s9, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, s9, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x411a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v9, v1, v4, -v8
+; GFX900-GISEL-NEXT:    v_fma_f32 v9, v1, v3, -v8
 ; GFX900-GISEL-NEXT:    v_fma_f32 v9, v1, v5, v9
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v9, 1.0, v3, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v9, s10, v9
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 5, v9
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v9, s10, v9
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v9, v9
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v6
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[2:3]
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v7, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v8
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v9
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v10, v9, v4, -v8
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_fma_f32 v10, v9, v3, -v8
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
 ; GFX900-GISEL-NEXT:    v_fma_f32 v10, v9, v5, v10
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, s11, v2
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v8, v8, v10
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v10, v2
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v9|, v6
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v9, v8, s[2:3]
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v8
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v3, v4, -v8
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v3, v5, v4
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v4, v8, v4
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v6
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[0:1]
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v10
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v10, v3, -v8
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v10, v5, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v3, v8, v3
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v10|, v6
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v7, s[0:1]
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v5
 ; GFX900-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -1860,60 +1892,67 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s6, 0x800000, s0
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s7, 0x800000, s1
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s8, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s9, 0x800000, s3
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s6, 0x800000, s0
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s7, 0x800000, s1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s6
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s7
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s8
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s9
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s6
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s7
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x411a209b, s6
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v2, s2, v2 :: v_dual_mul_f32 v3, s3, v3
-; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(TRANS32_DEP_3)
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v3, v3
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 5, v3
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 0x411a209b, s7
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 0x411a209b, s8
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 0x411a209b, s9
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v2, s2, v2
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v3, s3, v3
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v3, v3
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3e9a209a, v2
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
+; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v3
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v1, s1, v1
+; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1100-GISEL-NEXT:    v_fma_f32 v12, 0x3e9a209a, v2, -v7
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT:    v_fma_f32 v13, 0x3e9a209a, v3, -v8
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v12, 0x3284fbcf, v2 :: v_dual_fmac_f32 v13, 0x3284fbcf, v3
+; GFX1100-GISEL-NEXT:    v_add_f32_e32 v7, v7, v12
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3
+; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_add_f32 v8, v8, v13
+; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v1
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v10, 0x3e9a209a, v0, -v5
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v11, 0x3e9a209a, v1, -v6
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v12, 0x3e9a209a, v2, -v7
-; GFX1100-GISEL-NEXT:    v_fma_f32 v13, 0x3e9a209a, v3, -v8
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v10, 0x3284fbcf, v0 :: v_dual_fmac_f32 v11, 0x3284fbcf, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v12, 0x3284fbcf, v2 :: v_dual_fmac_f32 v13, 0x3284fbcf, v3
 ; GFX1100-GISEL-NEXT:    v_dual_add_f32 v5, v5, v10 :: v_dual_add_f32 v6, v6, v11
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_dual_add_f32 v7, v7, v12 :: v_dual_add_f32 v8, v8, v13
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
-; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_sub_f32 v1, v1, v9
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_sub_f32 v0, v0, v4
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v3|
-; GFX1100-GISEL-NEXT:    v_dual_cndmask_b32 v3, v3, v8 :: v_dual_sub_f32 v2, v2, v14
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc_lo
+; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v1, v1, v9 :: v_dual_sub_f32 v2, v2, v14
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v15
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX1100-GISEL-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log10_v4f32:
@@ -2126,10 +2165,10 @@ define float @v_log10_f32(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -2175,16 +2214,16 @@ define float @v_log10_f32(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -2224,10 +2263,10 @@ define float @v_log10_f32(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -2270,21 +2309,22 @@ define float @v_log10_f32(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2329,10 +2369,10 @@ define float @v_log10_fabs_f32(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e64 v0, |v0|, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -2378,16 +2418,16 @@ define float @v_log10_fabs_f32(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -2427,10 +2467,10 @@ define float @v_log10_fabs_f32(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -2475,20 +2515,22 @@ define float @v_log10_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, s0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2534,10 +2576,10 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e64 v0, -|v0|, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -2583,16 +2625,16 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, -|v0|, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -2632,10 +2674,10 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, -|v0|, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -2680,20 +2722,22 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -|v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, -|v0|, v1
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, s0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2740,10 +2784,10 @@ define float @v_log10_fneg_f32(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e64 v0, -v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -2789,16 +2833,16 @@ define float @v_log10_fneg_f32(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, -v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -2838,10 +2882,10 @@ define float @v_log10_fneg_f32(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, -v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -2885,20 +2929,22 @@ define float @v_log10_fneg_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, -v0, v1
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, s0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3304,10 +3350,10 @@ define float @v_log10_f32_ninf(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -3353,16 +3399,16 @@ define float @v_log10_f32_ninf(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -3402,10 +3448,10 @@ define float @v_log10_f32_ninf(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -3448,21 +3494,22 @@ define float @v_log10_f32_ninf(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4038,10 +4085,10 @@ define float @v_log10_f32_nnan(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -4087,16 +4134,16 @@ define float @v_log10_f32_nnan(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -4136,10 +4183,10 @@ define float @v_log10_f32_nnan(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -4182,21 +4229,22 @@ define float @v_log10_f32_nnan(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4381,10 +4429,10 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -4430,16 +4478,16 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -4479,10 +4527,10 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -4525,21 +4573,22 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4724,10 +4773,10 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -4773,16 +4822,16 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -4822,10 +4871,10 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -4868,21 +4917,22 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4924,10 +4974,10 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -4967,16 +5017,16 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x369a84fb, v0
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v0
+; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a2000, v0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -5010,10 +5060,10 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -5051,18 +5101,20 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5207,10 +5259,10 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -5250,16 +5302,16 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x369a84fb, v0
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v0
+; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a2000, v0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -5293,10 +5345,10 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -5334,18 +5386,20 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5419,10 +5473,10 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -5468,16 +5522,16 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -5517,10 +5571,10 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -5563,21 +5617,22 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 2c5a9f5..8b3b79b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -36,14 +36,14 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
 ; SI-GISEL-NEXT:    s_load_dword s2, s[4:5], 0xb
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s2, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s2, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    s_mov_b32 s2, -1
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -74,13 +74,13 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
 ; VI-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s2, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s2, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -108,20 +108,19 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
 ;
 ; GFX900-GISEL-LABEL: s_log2_f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s2, v0
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX900-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -147,20 +146,22 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
 ;
 ; GFX1100-GISEL-LABEL: s_log2_f32:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
+; GFX1100-GISEL-NEXT:    s_clause 0x1
+; GFX1100-GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s2, 0x800000, s0
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s3, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, s2
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
-; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s3
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, s3
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s2, v0
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
-; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log2_f32:
@@ -242,21 +243,22 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 1.0, v1, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v3, s6, v3
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
-; SI-GISEL-NEXT:    v_log_f32_e32 v3, v3
-; SI-GISEL-NEXT:    v_log_f32_e32 v1, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[0:1]
-; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v3, v0
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v2, s6, v2
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s7, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_log_f32_e32 v3, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v1, s[0:1]
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
+; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v3, v1
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -291,21 +293,22 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 1.0, v1, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, s6, v3
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
-; VI-GISEL-NEXT:    v_log_f32_e32 v3, v3
-; VI-GISEL-NEXT:    v_log_f32_e32 v1, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[0:1]
-; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v3, v0
-; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v2, s6, v2
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s7, v0
+; VI-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; VI-GISEL-NEXT:    v_log_f32_e32 v3, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v1, s[0:1]
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
+; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v3, v1
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -339,22 +342,23 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v0
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, 1.0, v1, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, s10, v3
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s11, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 5, v3
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v3, s10, v3
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s11, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v3, v3
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[0:1]
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v4, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v1, s[0:1]
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v3, v0
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v4, v1
 ; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -387,23 +391,28 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX1100-GISEL-LABEL: s_log2_v2f32:
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s5, 0x800000, s3
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s4
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s5
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 0x42000000, s5
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s4
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v1, s3, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
-; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s2, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX1100-GISEL-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log2_v2f32:
@@ -506,32 +515,34 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ;
 ; SI-GISEL-LABEL: s_log2_v3f32:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42000000
+; SI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s8, v0
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s1, v1
-; SI-GISEL-NEXT:    s_mov_b32 s6, -1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v1
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 5, v3
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v3, s9, v3
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v3, v3
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, s10, v1
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v4, s1, v4
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s2, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
-; SI-GISEL-NEXT:    v_log_f32_e32 v4, v4
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s2, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v4, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[0:1]
+; SI-GISEL-NEXT:    v_log_f32_e32 v4, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v2, s[0:1]
+; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v3, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
-; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v4, v2
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-GISEL-NEXT:    buffer_store_dword v2, off, s[4:7], 0 offset:8
 ; SI-GISEL-NEXT:    s_endpgm
@@ -571,32 +582,34 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ;
 ; VI-GISEL-LABEL: s_log2_v3f32:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; VI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
+; VI-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42000000
-; VI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s1, v1
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 5, v3
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v3, s9, v3
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, s10, v1
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, s1, v4
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s2, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s2, v1
-; VI-GISEL-NEXT:    v_log_f32_e32 v4, v4
-; VI-GISEL-NEXT:    v_log_f32_e32 v2, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[0:1]
-; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v4, v1
-; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s5
+; VI-GISEL-NEXT:    v_log_f32_e32 v3, v3
+; VI-GISEL-NEXT:    v_log_f32_e32 v4, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v2, s[0:1]
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v3, v1
+; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s3
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s2
 ; VI-GISEL-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
 ; VI-GISEL-NEXT:    s_endpgm
 ;
@@ -637,28 +650,30 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42000000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s1, v1
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, s1, v4
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 5, v4
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v4, s1, v4
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s2, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, s2, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v4, v4
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[0:1]
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v5, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[0:1]
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v4, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v5, v2
 ; GFX900-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -702,33 +717,40 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ;
 ; GFX1100-GISEL-LABEL: s_log2_v3f32:
 ; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_clause 0x1
 ; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
+; GFX1100-GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s3, 0x800000, s0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s6, 0x800000, s1
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s3, 0x800000, s0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s7, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s6
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s6
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s3
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s7
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x42000000, s6
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 0x42000000, s3
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x42000000, s7
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v1, s1, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v2, s2, v2
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v3 :: v_dual_mov_b32 v3, 0
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v4
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v2, s2, v2
+; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v3
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v5
-; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
+; GFX1100-GISEL-NEXT:    global_store_b96 v6, v[0:2], s[4:5]
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log2_v3f32:
@@ -865,34 +887,37 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x42000000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42000000
+; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v2
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s8, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s9, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, s9, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
-; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v3, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 1.0, v3, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v5, s10, v5
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
-; SI-GISEL-NEXT:    v_log_f32_e32 v5, v5
-; SI-GISEL-NEXT:    v_log_f32_e32 v3, v2
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
-; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v5, v2
-; SI-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
-; SI-GISEL-NEXT:    s_mov_b32 s6, -1
+; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 5, v4
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v4, s10, v4
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v2, s11, v2
+; SI-GISEL-NEXT:    v_log_f32_e32 v4, v4
+; SI-GISEL-NEXT:    v_log_f32_e32 v5, v2
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[0:1]
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v4, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v3, v5, v3
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-GISEL-NEXT:    s_endpgm
@@ -942,33 +967,36 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x42000000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42000000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s9, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, s9, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
-; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v5
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v3, s[0:1]
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v5
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 1.0, v3, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v5, s10, v5
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
-; VI-GISEL-NEXT:    v_log_f32_e32 v5, v5
-; VI-GISEL-NEXT:    v_log_f32_e32 v3, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
-; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v5, v2
-; VI-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
+; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 5, v4
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; VI-GISEL-NEXT:    v_ldexp_f32 v4, s10, v4
+; VI-GISEL-NEXT:    v_ldexp_f32 v2, s11, v2
+; VI-GISEL-NEXT:    v_log_f32_e32 v4, v4
+; VI-GISEL-NEXT:    v_log_f32_e32 v5, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[0:1]
+; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT:    v_sub_f32_e32 v3, v5, v3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s2
 ; VI-GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -1018,34 +1046,37 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x42000000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42000000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v2
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s9, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, s9, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v5
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v5
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v5, 1.0, v3, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s10, v5
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 5, v5
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v5, s10, v5
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, s11, v2
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v5, v5
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v3, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v6, v2
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[0:1]
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v5, v2
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v3, v6, v3
 ; GFX900-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -1095,39 +1126,46 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ;
 ; GFX1100-GISEL-LABEL: s_log2_v4f32:
 ; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_clause 0x1
 ; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
+; GFX1100-GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s6, 0x800000, s0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s7, 0x800000, s1
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s8, 0x800000, s2
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s6, 0x800000, s0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s9, 0x800000, s3
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s8, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s6
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s7
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s7
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s6
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x42000000, s7
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s9
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x42000000, s6
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v2, s2, v2 :: v_dual_mul_f32 v3, s3, v3
-; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s8
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 0x42000000, s9
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 0x42000000, s8
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v1, s1, v1
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v1, v1, v5 :: v_dual_lshlrev_b32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v4 :: v_dual_lshlrev_b32 v3, 5, v3
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v3, s3, v3
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v3, v3
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x42000000, s7
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 0x42000000, s8
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 0x42000000, s9
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v2, v2, v6 :: v_dual_sub_f32 v3, v3, v7
-; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v3, v3, v7 :: v_dual_lshlrev_b32 v2, 5, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v2, s2, v2
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v6
+; GFX1100-GISEL-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log2_v4f32:
@@ -1243,19 +1281,19 @@ define float @v_log2_f32(float %in) {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -1271,6 +1309,20 @@ define float @v_log2_f32(float %in) {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1285,6 +1337,20 @@ define float @v_log2_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1304,10 +1370,12 @@ define float @v_log2_f32(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -1341,19 +1409,19 @@ define float @v_log2_fabs_f32(float %in) {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_fabs_f32:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_fabs_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e64 v0, |v0|, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_fabs_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -1369,6 +1437,20 @@ define float @v_log2_fabs_f32(float %in) {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_fabs_f32:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_fabs_f32:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1383,6 +1465,20 @@ define float @v_log2_fabs_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_fabs_f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_fabs_f32:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1403,10 +1499,11 @@ define float @v_log2_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, s0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -1441,19 +1538,19 @@ define float @v_log2_fneg_fabs_f32(float %in) {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_fneg_fabs_f32:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_fneg_fabs_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e64 v0, -|v0|, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_fneg_fabs_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -1469,6 +1566,20 @@ define float @v_log2_fneg_fabs_f32(float %in) {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_fneg_fabs_f32:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, -|v0|, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_fneg_fabs_f32:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1483,6 +1594,20 @@ define float @v_log2_fneg_fabs_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_fneg_fabs_f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, -|v0|, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_fneg_fabs_f32:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1503,10 +1628,11 @@ define float @v_log2_fneg_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -|v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, -|v0|, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, s0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -1542,19 +1668,19 @@ define float @v_log2_fneg_f32(float %in) {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_fneg_f32:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_fneg_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e64 v0, -v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_fneg_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -1570,6 +1696,20 @@ define float @v_log2_fneg_f32(float %in) {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_fneg_f32:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, -v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_fneg_f32:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1584,6 +1724,20 @@ define float @v_log2_fneg_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_fneg_f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, -v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_fneg_f32:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1604,10 +1758,11 @@ define float @v_log2_fneg_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, -v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, s0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -1642,19 +1797,19 @@ define float @v_log2_f32_fast(float %in) {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_fast:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_fast:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_fast:
 ; VI-SDAG:       ; %bb.0:
@@ -1670,6 +1825,20 @@ define float @v_log2_f32_fast(float %in) {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_fast:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_fast:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1684,6 +1853,20 @@ define float @v_log2_f32_fast(float %in) {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_fast:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_fast:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1703,10 +1886,12 @@ define float @v_log2_f32_fast(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -1740,19 +1925,19 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_unsafe_math_attr:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_unsafe_math_attr:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_unsafe_math_attr:
 ; VI-SDAG:       ; %bb.0:
@@ -1768,6 +1953,20 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_unsafe_math_attr:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_unsafe_math_attr:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1782,6 +1981,20 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_unsafe_math_attr:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_unsafe_math_attr:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1801,10 +2014,12 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -1838,19 +2053,19 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_approx_fn_attr:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_approx_fn_attr:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_approx_fn_attr:
 ; VI-SDAG:       ; %bb.0:
@@ -1866,6 +2081,20 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_approx_fn_attr:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_approx_fn_attr:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1880,6 +2109,20 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_approx_fn_attr:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_approx_fn_attr:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1899,10 +2142,12 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -1936,19 +2181,19 @@ define float @v_log2_f32_ninf(float %in) {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_ninf:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_ninf:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_ninf:
 ; VI-SDAG:       ; %bb.0:
@@ -1964,6 +2209,20 @@ define float @v_log2_f32_ninf(float %in) {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_ninf:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_ninf:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1978,6 +2237,20 @@ define float @v_log2_f32_ninf(float %in) {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_ninf:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_ninf:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1997,10 +2270,12 @@ define float @v_log2_f32_ninf(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -2034,19 +2309,19 @@ define float @v_log2_f32_afn(float %in) {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_afn:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_afn:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_afn:
 ; VI-SDAG:       ; %bb.0:
@@ -2062,6 +2337,20 @@ define float @v_log2_f32_afn(float %in) {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_afn:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_afn:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2076,6 +2365,20 @@ define float @v_log2_f32_afn(float %in) {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_afn:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_afn:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2095,10 +2398,12 @@ define float @v_log2_f32_afn(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -2158,19 +2463,19 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_afn_dynamic:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_afn_dynamic:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_afn_dynamic:
 ; VI-SDAG:       ; %bb.0:
@@ -2186,6 +2491,20 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_afn_dynamic:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_afn_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2200,6 +2519,20 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_afn_dynamic:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_afn_dynamic:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2219,10 +2552,12 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -2256,19 +2591,19 @@ define float @v_fabs_log2_f32_afn(float %in) {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_fabs_log2_f32_afn:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_fabs_log2_f32_afn:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e64 v0, |v0|, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_fabs_log2_f32_afn:
 ; VI-SDAG:       ; %bb.0:
@@ -2284,6 +2619,20 @@ define float @v_fabs_log2_f32_afn(float %in) {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_fabs_log2_f32_afn:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_fabs_log2_f32_afn:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2298,6 +2647,20 @@ define float @v_fabs_log2_f32_afn(float %in) {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_fabs_log2_f32_afn:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_fabs_log2_f32_afn:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2318,10 +2681,11 @@ define float @v_fabs_log2_f32_afn(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, s0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -2382,19 +2746,19 @@ define float @v_log2_f32_nnan(float %in) {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_nnan:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_nnan:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_nnan:
 ; VI-SDAG:       ; %bb.0:
@@ -2410,6 +2774,20 @@ define float @v_log2_f32_nnan(float %in) {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_nnan:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_nnan:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2424,6 +2802,20 @@ define float @v_log2_f32_nnan(float %in) {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_nnan:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_nnan:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2443,10 +2835,12 @@ define float @v_log2_f32_nnan(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -2506,19 +2900,19 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_nnan_dynamic:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_nnan_dynamic:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_nnan_dynamic:
 ; VI-SDAG:       ; %bb.0:
@@ -2534,6 +2928,20 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_nnan_dynamic:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_nnan_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2548,6 +2956,20 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_nnan_dynamic:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_nnan_dynamic:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2567,10 +2989,12 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -2630,19 +3054,19 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_ninf_dynamic:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_ninf_dynamic:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_ninf_dynamic:
 ; VI-SDAG:       ; %bb.0:
@@ -2658,6 +3082,20 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_ninf_dynamic:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_ninf_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2672,6 +3110,20 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_ninf_dynamic:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_ninf_dynamic:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2691,10 +3143,12 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -2728,19 +3182,19 @@ define float @v_log2_f32_nnan_ninf(float %in) {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_nnan_ninf:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_nnan_ninf:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_nnan_ninf:
 ; VI-SDAG:       ; %bb.0:
@@ -2756,6 +3210,20 @@ define float @v_log2_f32_nnan_ninf(float %in) {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_nnan_ninf:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_nnan_ninf:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2770,6 +3238,20 @@ define float @v_log2_f32_nnan_ninf(float %in) {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_nnan_ninf:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_nnan_ninf:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2789,10 +3271,12 @@ define float @v_log2_f32_nnan_ninf(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -2852,19 +3336,19 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic:
 ; VI-SDAG:       ; %bb.0:
@@ -2880,6 +3364,20 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2894,6 +3392,20 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2913,10 +3425,12 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -2976,19 +3490,19 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_dynamic_mode:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_dynamic_mode:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_dynamic_mode:
 ; VI-SDAG:       ; %bb.0:
@@ -3004,6 +3518,20 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_dynamic_mode:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_dynamic_mode:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3018,6 +3546,20 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_dynamic_mode:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_dynamic_mode:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3037,10 +3579,12 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 4de0c54..795ed6d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,VI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,GFX9 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12 %s
 
 declare half @llvm.rint.f16(half %a)
 declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)
@@ -63,6 +64,24 @@ define amdgpu_kernel void @rint_f16(
 ; GFX11-NEXT:    v_rndne_f16_e32 v0, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: rint_f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s6, -1
+; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s10, s6
+; GFX12-NEXT:    s_mov_b32 s11, s7
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s2
+; GFX12-NEXT:    s_mov_b32 s9, s3
+; GFX12-NEXT:    s_mov_b32 s4, s0
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_mov_b32 s5, s1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[4:7], null
+; GFX12-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
 entry:
@@ -168,6 +187,28 @@ define amdgpu_kernel void @rint_v2f16(
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: rint_v2f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s6, -1
+; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s10, s6
+; GFX12-NEXT:    s_mov_b32 s11, s7
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s2
+; GFX12-NEXT:    s_mov_b32 s9, s3
+; GFX12-NEXT:    s_mov_b32 s4, s0
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_mov_b32 s5, s1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_rndne_f16_e32 v1, v1
+; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[4:7], null
+; GFX12-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index 2bb89fd..6927636 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -4,6 +4,7 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX12 %s
 
 define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX6-LABEL: sin_f16:
@@ -80,6 +81,19 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX11-NEXT:    v_sin_f16_e32 v1, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sin_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_sin_f16_e32 v1, v1
+; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
   %a.val = load half, ptr addrspace(1) %a
   %r.val = call half @llvm.sin.f16(half %a.val)
   store half %r.val, ptr addrspace(1) %r
@@ -188,6 +202,24 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sin_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX12-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX12-NEXT:    v_sin_f16_e32 v1, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-NEXT:    v_sin_f16_e32 v2, v2
+; GFX12-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
   %a.val = load <2 x half>, ptr addrspace(1) %a
   %r.val = call <2 x half> @llvm.sin.v2f16(<2 x half> %a.val)
   store <2 x half> %r.val, ptr addrspace(1) %r
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index 47777e3..0d58afd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
 
 declare half @llvm.trunc.f16(half %a)
 declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a)
@@ -62,6 +63,24 @@ define amdgpu_kernel void @trunc_f16(
 ; GFX11-NEXT:    v_trunc_f16_e32 v0, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: trunc_f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s6, -1
+; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s10, s6
+; GFX12-NEXT:    s_mov_b32 s11, s7
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s2
+; GFX12-NEXT:    s_mov_b32 s9, s3
+; GFX12-NEXT:    s_mov_b32 s4, s0
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_mov_b32 s5, s1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_trunc_f16_e32 v0, v0
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[4:7], null
+; GFX12-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
 entry:
@@ -147,6 +166,28 @@ define amdgpu_kernel void @trunc_v2f16(
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: trunc_v2f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s6, -1
+; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s10, s6
+; GFX12-NEXT:    s_mov_b32 s11, s7
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s2
+; GFX12-NEXT:    s_mov_b32 s9, s3
+; GFX12-NEXT:    s_mov_b32 s4, s0
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_mov_b32 s5, s1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-NEXT:    v_trunc_f16_e32 v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_trunc_f16_e32 v1, v1
+; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[4:7], null
+; GFX12-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 33007e5..3be17f9 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -1333,5 +1333,668 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
   ret i48 %a
 }
 
+define i64 @lshr_mad_i64_1(i64 %arg0, i64 %arg1) #0 {
+; CI-LABEL: lshr_mad_i64_1:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_movk_i32 s4, 0xfc19
+; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
+; CI-NEXT:    v_mov_b32_e32 v0, v2
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_1:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_movk_i32 s4, 0xfc19
+; SI-NEXT:    v_mul_hi_u32 v2, v1, s4
+; SI-NEXT:    v_mul_lo_u32 v3, v1, s4
+; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s4, 0xfc19
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v1, v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0xfffffc19, v1, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0xfffffc19, v1, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 %lsh, s0xfffffffffffffc19
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_2(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_2:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_movk_i32 s4, 0xd1
+; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
+; CI-NEXT:    v_mov_b32_e32 v0, v2
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_2:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_movk_i32 s4, 0xd1
+; SI-NEXT:    v_mul_hi_u32 v2, v1, s4
+; SI-NEXT:    v_mul_lo_u32 v3, v1, s4
+; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s4, 0xd1
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v1, v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 %lsh, s0xffffffff000000d1
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_3(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_3:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_movk_i32 s4, 0xfc88
+; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
+; CI-NEXT:    v_mov_b32_e32 v0, v2
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_3:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_movk_i32 s4, 0xfc88
+; SI-NEXT:    v_mul_hi_u32 v2, v1, s4
+; SI-NEXT:    v_mul_lo_u32 v3, v1, s4
+; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s4, 0xfc88
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v1, v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_3:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0xfffffc88, v1, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_3:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0xfffffc88, v1, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 s0xfffffffffffffc88, %lsh
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 {
+; CI-LABEL: lshr_mad_i64_4:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_mul_lo_u32 v3, v2, v0
+; CI-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v1, v0, 0
+; CI-NEXT:    s_movk_i32 s4, 0xfc88
+; CI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[1:2]
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_4:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_mul_lo_u32 v2, v2, v0
+; SI-NEXT:    v_mul_hi_u32 v3, v1, v0
+; SI-NEXT:    s_movk_i32 s4, 0xfc88
+; SI-NEXT:    v_mul_lo_u32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; SI-NEXT:    v_mul_hi_u32 v3, v2, s4
+; SI-NEXT:    v_mul_lo_u32 v1, v2, s4
+; SI-NEXT:    v_sub_i32_e32 v3, vcc, v3, v2
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v2, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v5
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v2, v0, v[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9-NEXT:    s_movk_i32 s4, 0xfc88
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[4:5]
+; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_4:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[3:4], null, v1, v0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v4
+; GFX11-NEXT:    v_mad_u64_u32 v[5:6], null, v2, v0, v[1:2]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v5
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v1, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_4:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[3:4], null, v1, v0, 0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_mov_b32_e32 v1, v4
+; GFX12-NEXT:    v_mad_co_u64_u32 v[5:6], null, v2, v0, v[1:2]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_mov_b32_e32 v4, v5
+; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v1, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %ext = zext i32 %arg0 to i64
+  %mul1 = mul i64 %arg1, %ext
+  %lsh = lshr i64 %mul1, 32
+  %mul2 = mul i64 %lsh, s0xfffffffffffffc88
+  %mad = add i64 %mul2, %mul1
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_negative_1(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_negative_1:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
+; CI-NEXT:    s_movk_i32 s4, 0xfc19
+; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v2, s4, v[0:1]
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_negative_1:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
+; SI-NEXT:    s_movk_i32 s4, 0xfc19
+; SI-NEXT:    v_mul_lo_u32 v3, v2, s4
+; SI-NEXT:    v_mul_hi_i32 v2, v2, s4
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_negative_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
+; GFX9-NEXT:    s_movk_i32 s4, 0xfc19
+; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v2, s4, v[0:1]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: lshr_mad_i64_negative_1:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_lshrrev_b32_e32 v4, 4, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_mad_i64_i32 v[2:3], null, 0xfffffc19, v4, v[0:1]
+; GFX1100-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1150-LABEL: lshr_mad_i64_negative_1:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1150-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_mad_i64_i32 v[0:1], null, 0xfffffc19, v2, v[0:1]
+; GFX1150-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_negative_1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_mad_co_i64_i32 v[0:1], null, 0xfffffc19, v2, v[0:1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr i64 %arg0, 36
+  %mul = mul i64 %lsh, s0xfffffffffffffc19
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_negative_2(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_negative_2:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_movk_i32 s4, 0xd1
+; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v3, v0
+; CI-NEXT:    v_mov_b32_e32 v0, v2
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_negative_2:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_movk_i32 s4, 0xd1
+; SI-NEXT:    v_mul_hi_u32 v2, v1, s4
+; SI-NEXT:    v_mul_lo_u32 v4, v1, s4
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v1
+; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_negative_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s4, 0xd1
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
+; GFX9-NEXT:    v_sub_u32_e32 v1, v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_negative_2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_negative_2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v3, v0
+; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 %lsh, s0xffffff00000000d1
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_negative_3(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_negative_3:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshr_b64 v[2:3], v[0:1], 22
+; CI-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v2
+; CI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_negative_3:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_lshr_b64 v[2:3], v[0:1], 22
+; SI-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v2
+; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; SI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_negative_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 22, v[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v2
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_negative_3:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b64 v[2:3], 22, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v2
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_negative_3:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b64 v[2:3], 22, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v2
+; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = add i64 %arg0, 1
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 %lsh, s0xfffffffffffffc00
+  %mad = add i64 %mul, %op
+
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_negative_4:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, v0, v[0:1]
+; CI-NEXT:    v_mul_lo_u32 v0, v1, v1
+; CI-NEXT:    v_add_i32_e32 v1, vcc, v0, v3
+; CI-NEXT:    v_mov_b32_e32 v0, v2
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_negative_4:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_mul_hi_u32 v2, v1, v0
+; SI-NEXT:    v_mul_lo_u32 v3, v1, v1
+; SI-NEXT:    v_mul_lo_u32 v4, v1, v0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_negative_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, v0, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v1, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: lshr_mad_i64_negative_4:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1]
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1100-NEXT:    v_mad_u64_u32 v[3:4], null, v1, v1, v[0:1]
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1150-LABEL: lshr_mad_i64_negative_4:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1150-NEXT:    v_mad_u64_u32 v[3:4], null, v1, v0, v[0:1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v0, v4
+; GFX1150-NEXT:    v_mad_u64_u32 v[1:2], null, v1, v1, v[0:1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1150-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1150-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_negative_4:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[3:4], null, v1, v0, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_mov_b32_e32 v0, v4
+; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], null, v1, v1, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 %lsh, %arg0
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define amdgpu_ps i64 @lshr_mad_i64_sgpr(i64 inreg %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_sgpr:
+; CI:       ; %bb.0:
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v2, 0xffff1c18
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s1, v2, v[0:1]
+; CI-NEXT:    v_subrev_i32_e32 v1, vcc, s1, v1
+; CI-NEXT:    v_readfirstlane_b32 s0, v0
+; CI-NEXT:    v_readfirstlane_b32 s1, v1
+; CI-NEXT:    ; return to shader part epilog
+;
+; SI-LABEL: lshr_mad_i64_sgpr:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_mov_b32_e32 v0, 0xffff1c18
+; SI-NEXT:    v_mul_hi_u32 v0, s1, v0
+; SI-NEXT:    s_mul_i32 s2, s1, 0xffff1c18
+; SI-NEXT:    v_readfirstlane_b32 s3, v0
+; SI-NEXT:    s_sub_i32 s3, s3, s1
+; SI-NEXT:    s_add_u32 s0, s2, s0
+; SI-NEXT:    s_addc_u32 s1, s3, s1
+; SI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: lshr_mad_i64_sgpr:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mul_hi_u32 s2, s1, 0xffff1c18
+; GFX9-NEXT:    s_sub_i32 s2, s2, s1
+; GFX9-NEXT:    s_mul_i32 s3, s1, 0xffff1c18
+; GFX9-NEXT:    s_add_u32 s0, s3, s0
+; GFX9-NEXT:    s_addc_u32 s1, s2, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: lshr_mad_i64_sgpr:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mul_hi_u32 s2, s1, 0xffff1c18
+; GFX11-NEXT:    s_mul_i32 s3, s1, 0xffff1c18
+; GFX11-NEXT:    s_sub_i32 s2, s2, s1
+; GFX11-NEXT:    s_add_u32 s0, s3, s0
+; GFX11-NEXT:    s_addc_u32 s1, s2, s1
+; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: lshr_mad_i64_sgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b32 s4, 0xffff1c18
+; GFX12-NEXT:    s_mov_b32 s3, 0
+; GFX12-NEXT:    s_mov_b32 s2, s1
+; GFX12-NEXT:    s_mov_b32 s5, -1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_mul_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-NEXT:    ; return to shader part epilog
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 %lsh, s0xffffffffffff1c18
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_vec:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s4, 0xffff1c18
+; CI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1]
+; CI-NEXT:    s_mov_b32 s4, 0xffff1118
+; CI-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3]
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v5, v1
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, v7, v3
+; CI-NEXT:    v_mov_b32_e32 v0, v4
+; CI-NEXT:    v_mov_b32_e32 v2, v6
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_vec:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, 0xffff1118
+; SI-NEXT:    v_mul_lo_u32 v4, v3, s4
+; SI-NEXT:    v_mul_hi_u32 v5, v3, s4
+; SI-NEXT:    s_mov_b32 s4, 0xffff1c18
+; SI-NEXT:    v_mul_hi_u32 v6, v1, s4
+; SI-NEXT:    v_mul_lo_u32 v7, v1, s4
+; SI-NEXT:    v_sub_i32_e32 v5, vcc, v5, v3
+; SI-NEXT:    v_sub_i32_e32 v6, vcc, v6, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
+; SI-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; SI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_vec:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff1c18
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff1118
+; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3]
+; GFX9-NEXT:    v_sub_u32_e32 v1, v5, v1
+; GFX9-NEXT:    v_sub_u32_e32 v3, v7, v3
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_mov_b32_e32 v2, v6
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_vec:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1]
+; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v5, v1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, v7, v3
+; GFX11-NEXT:    v_mov_b32_e32 v2, v6
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_vec:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1]
+; GFX12-NEXT:    v_mad_co_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v5, v1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v3, v7, v3
+; GFX12-NEXT:    v_mov_b32_e32 v2, v6
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr <2 x i64> %arg0, <i64 32, i64 32>
+  %mul = mul <2 x i64> %lsh, <i64 s0xffffffffffff1c18, i64 s0xffffffffffff1118>
+  %mad = add <2 x i64> %mul, %arg0
+
+  ret <2 x i64> %mad
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index 5e46fd6..fa15a42 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -1838,11 +1838,11 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v2
-; GFX8-NEXT:    v_max_f16_e32 v2, v3, v3
+; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v2
+; GFX8-NEXT:    v_max_f16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_max_f16_e32 v1, v1, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maximumnum_v3f16:
@@ -1904,8 +1904,8 @@ define <3 x half> @v_maximumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maximumnum_v3f16_nnan:
@@ -1947,20 +1947,20 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX8-LABEL: v_maximumnum_v4f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v5, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v5, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v6, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT:    v_max_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v3
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maximumnum_v4f16:
@@ -2020,12 +2020,12 @@ define <4 x half> @v_maximumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) {
 ; GFX8-LABEL: v_maximumnum_v4f16_nnan:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v2
-; GFX8-NEXT:    v_max_f16_sdwa v2, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v3
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maximumnum_v4f16_nnan:
@@ -2067,27 +2067,27 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX8-LABEL: v_maximumnum_v6f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_sdwa v6, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v7, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_sdwa v6, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v7, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v3
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
-; GFX8-NEXT:    v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v7, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_sdwa v8, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT:    v_max_f16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v2, v2, v5
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v4
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    v_max_f16_sdwa v3, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v4, v5, v5
-; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:    v_max_f16_e32 v2, v2, v4
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maximumnum_v6f16:
@@ -2159,34 +2159,34 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX8-LABEL: v_maximumnum_v8f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_sdwa v8, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_sdwa v8, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v9, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
-; GFX8-NEXT:    v_max_f16_sdwa v4, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v9, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v10, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_sdwa v10, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v11, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_sdwa v11, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v12, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v7, v7, v7
+; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT:    v_max_f16_e32 v6, v6, v6
+; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX8-NEXT:    v_max_f16_e32 v5, v5, v5
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT:    v_max_f16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v3, v3, v7
+; GFX8-NEXT:    v_max_f16_e32 v2, v2, v6
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v5
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v5, v6, v6
-; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:    v_max_f16_e32 v2, v2, v5
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX8-NEXT:    v_max_f16_sdwa v4, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v5, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v5, v7, v7
-; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT:    v_max_f16_e32 v3, v3, v5
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v9
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v8
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maximumnum_v8f16:
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
index 9e0b7da..f5fb85d 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
@@ -1792,11 +1792,11 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX8-NEXT:    v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_min_f16_e32 v0, v0, v2
-; GFX8-NEXT:    v_max_f16_e32 v2, v3, v3
+; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v2
+; GFX8-NEXT:    v_min_f16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_min_f16_e32 v1, v1, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minimumnum_v3f16:
@@ -1858,8 +1858,8 @@ define <3 x half> @v_minimumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_f16_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_min_f16_e32 v1, v1, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minimumnum_v3f16_nnan:
@@ -1901,20 +1901,20 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX8-LABEL: v_minimumnum_v4f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v5, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v5, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_min_f16_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v6, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT:    v_min_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_min_f16_e32 v1, v1, v3
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minimumnum_v4f16:
@@ -1974,12 +1974,12 @@ define <4 x half> @v_minimumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) {
 ; GFX8-LABEL: v_minimumnum_v4f16_nnan:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_e32 v0, v0, v2
-; GFX8-NEXT:    v_min_f16_sdwa v2, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_f16_e32 v1, v1, v3
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minimumnum_v4f16_nnan:
@@ -2021,27 +2021,27 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX8-LABEL: v_minimumnum_v6f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_sdwa v6, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v7, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_sdwa v6, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v7, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_f16_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_min_f16_e32 v0, v0, v3
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
-; GFX8-NEXT:    v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v7, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_sdwa v8, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT:    v_min_f16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_min_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_min_f16_e32 v2, v2, v5
 ; GFX8-NEXT:    v_min_f16_e32 v1, v1, v4
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    v_max_f16_sdwa v3, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v4, v5, v5
-; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:    v_min_f16_e32 v2, v2, v4
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minimumnum_v6f16:
@@ -2113,34 +2113,34 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX8-LABEL: v_minimumnum_v8f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_sdwa v8, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_sdwa v8, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v9, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_min_f16_e32 v0, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
-; GFX8-NEXT:    v_max_f16_sdwa v4, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v9, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v10, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_sdwa v10, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v11, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_sdwa v11, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v12, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v7, v7, v7
+; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT:    v_max_f16_e32 v6, v6, v6
+; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX8-NEXT:    v_max_f16_e32 v5, v5, v5
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT:    v_min_f16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_min_f16_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_min_f16_e32 v3, v3, v7
+; GFX8-NEXT:    v_min_f16_e32 v2, v2, v6
 ; GFX8-NEXT:    v_min_f16_e32 v1, v1, v5
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v5, v6, v6
-; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:    v_min_f16_e32 v2, v2, v5
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX8-NEXT:    v_max_f16_sdwa v4, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v5, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v5, v7, v7
-; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT:    v_min_f16_e32 v3, v3, v5
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v9
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v8
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minimumnum_v8f16:
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index 73f3d4c..774a22f 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG,SDAG-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL,GISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-GFX12 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-GFX12 %s
 
 define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_minmax_i32:
@@ -8,6 +10,16 @@ define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_maxmin_i32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_minmax_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maxmin_i32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
   %sminmax = call i32 @llvm.smin.i32(i32 %smax, i32 %c)
   ret i32 %sminmax
@@ -45,6 +57,16 @@ define i32 @test_minmax_commuted_i32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_maxmin_i32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_minmax_commuted_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maxmin_i32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
   %sminmax = call i32 @llvm.smin.i32(i32 %c, i32 %smax)
   ret i32 %sminmax
@@ -56,6 +78,16 @@ define i32 @test_maxmin_i32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_minmax_i32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_maxmin_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minmax_i32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b)
   %smaxmin = call i32 @llvm.smax.i32(i32 %smin, i32 %c)
   ret i32 %smaxmin
@@ -67,6 +99,16 @@ define i32 @test_maxmin_commuted_i32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_minmax_i32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_maxmin_commuted_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minmax_i32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b)
   %smaxmin = call i32 @llvm.smax.i32(i32 %c, i32 %smin)
   ret i32 %smaxmin
@@ -79,6 +121,17 @@ define void @test_smed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
 ; GFX11-NEXT:    v_med3_i32 v2, v2, v3, v4
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_smed3_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_med3_i32 v2, v2, v3, v4
+; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call i32 @llvm.smin.i32(i32 %x, i32 %y)
   %tmp1 = call i32 @llvm.smax.i32(i32 %x, i32 %y)
   %tmp2 = call i32 @llvm.smin.i32(i32 %tmp1, i32 %z)
@@ -93,6 +146,16 @@ define i32 @test_minmax_u32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_maxmin_u32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_minmax_u32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maxmin_u32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
   %uminmax = call i32 @llvm.umin.i32(i32 %umax, i32 %c)
   ret i32 %uminmax
@@ -130,6 +193,16 @@ define i32 @test_minmax_commuted_u32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_maxmin_u32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_minmax_commuted_u32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maxmin_u32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
   %uminmax = call i32 @llvm.umin.i32(i32 %c, i32 %umax)
   ret i32 %uminmax
@@ -141,6 +214,16 @@ define i32 @test_maxmin_u32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_minmax_u32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_maxmin_u32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minmax_u32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b)
   %umaxmin = call i32 @llvm.umax.i32(i32 %umin, i32 %c)
   ret i32 %umaxmin
@@ -152,6 +235,16 @@ define i32 @test_maxmin_commuted_u32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_minmax_u32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_maxmin_commuted_u32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minmax_u32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b)
   %umaxmin = call i32 @llvm.umax.i32(i32 %c, i32 %umin)
   ret i32 %umaxmin
@@ -164,6 +257,17 @@ define void @test_umed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
 ; GFX11-NEXT:    v_med3_u32 v2, v2, v3, v4
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_umed3_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_med3_u32 v2, v2, v3, v4
+; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call i32 @llvm.umin.i32(i32 %x, i32 %y)
   %tmp1 = call i32 @llvm.umax.i32(i32 %x, i32 %y)
   %tmp2 = call i32 @llvm.umin.i32(i32 %tmp1, i32 %z)
@@ -173,44 +277,88 @@ define void @test_umed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
 }
 
 define float @test_minmax_f32_ieee_true(float %a, float %b, float %c) {
-; SDAG-LABEL: test_minmax_f32_ieee_true:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
-; SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
-; SDAG-NEXT:    v_maxmin_f32 v0, v0, v1, v2
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: test_minmax_f32_ieee_true:
+; SDAG-GFX11:       ; %bb.0:
+; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
+; SDAG-GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; SDAG-GFX11-NEXT:    v_maxmin_f32 v0, v0, v1, v2
+; SDAG-GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GISEL-LABEL: test_minmax_f32_ieee_true:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GISEL-NEXT:    v_maxmin_f32 v0, v0, v1, v2
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GISEL-GFX11-LABEL: test_minmax_f32_ieee_true:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GISEL-GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; GISEL-GFX11-NEXT:    v_maxmin_f32 v0, v0, v1, v2
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-LABEL: test_minmax_f32_ieee_true:
+; SDAG-GFX12:       ; %bb.0:
+; SDAG-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_expcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_samplecnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_kmcnt 0x0
+; SDAG-GFX12-NEXT:    v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0
+; SDAG-GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
+; SDAG-GFX12-NEXT:    v_maxmin_num_f32 v0, v0, v1, v2
+; SDAG-GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-LABEL: test_minmax_f32_ieee_true:
+; GISEL-GFX12:       ; %bb.0:
+; GISEL-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_expcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_samplecnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
+; GISEL-GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
+; GISEL-GFX12-NEXT:    v_maxmin_num_f32 v0, v0, v1, v2
+; GISEL-GFX12-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maxnum.f32(float %a, float %b)
   %minmax = call float @llvm.minnum.f32(float %max, float %c)
   ret float %minmax
 }
 
 define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg %b, float inreg %c, ptr addrspace(1) inreg %out) {
-; SDAG-LABEL: s_test_minmax_f32_ieee_false:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; SDAG-NEXT:    s_mov_b32 s5, s4
-; SDAG-NEXT:    s_mov_b32 s4, s3
-; SDAG-NEXT:    v_maxmin_f32 v0, s0, s1, v0
-; SDAG-NEXT:    global_store_b32 v1, v0, s[4:5]
-; SDAG-NEXT:    s_endpgm
+; SDAG-GFX11-LABEL: s_test_minmax_f32_ieee_false:
+; SDAG-GFX11:       ; %bb.0:
+; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX11-NEXT:    s_mov_b32 s5, s4
+; SDAG-GFX11-NEXT:    s_mov_b32 s4, s3
+; SDAG-GFX11-NEXT:    v_maxmin_f32 v0, s0, s1, v0
+; SDAG-GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; SDAG-GFX11-NEXT:    s_endpgm
 ;
-; GISEL-LABEL: s_test_minmax_f32_ieee_false:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; GISEL-NEXT:    s_mov_b32 s6, s3
-; GISEL-NEXT:    s_mov_b32 s7, s4
-; GISEL-NEXT:    v_maxmin_f32 v0, s0, s1, v0
-; GISEL-NEXT:    global_store_b32 v1, v0, s[6:7]
-; GISEL-NEXT:    s_endpgm
+; GISEL-GFX11-LABEL: s_test_minmax_f32_ieee_false:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GISEL-GFX11-NEXT:    s_mov_b32 s6, s3
+; GISEL-GFX11-NEXT:    s_mov_b32 s7, s4
+; GISEL-GFX11-NEXT:    v_maxmin_f32 v0, s0, s1, v0
+; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[6:7]
+; GISEL-GFX11-NEXT:    s_endpgm
+;
+; SDAG-GFX12-LABEL: s_test_minmax_f32_ieee_false:
+; SDAG-GFX12:       ; %bb.0:
+; SDAG-GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX12-NEXT:    s_mov_b32 s5, s4
+; SDAG-GFX12-NEXT:    s_mov_b32 s4, s3
+; SDAG-GFX12-NEXT:    v_maxmin_num_f32 v0, s0, s1, v0
+; SDAG-GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
+; SDAG-GFX12-NEXT:    s_endpgm
+;
+; GISEL-GFX12-LABEL: s_test_minmax_f32_ieee_false:
+; GISEL-GFX12:       ; %bb.0:
+; GISEL-GFX12-NEXT:    s_max_num_f32 s0, s0, s1
+; GISEL-GFX12-NEXT:    s_mov_b32 s6, s3
+; GISEL-GFX12-NEXT:    s_mov_b32 s7, s4
+; GISEL-GFX12-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-GFX12-NEXT:    s_min_num_f32 s0, s0, s2
+; GISEL-GFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX12-NEXT:    global_store_b32 v1, v0, s[6:7]
+; GISEL-GFX12-NEXT:    s_endpgm
   %smax = call float @llvm.maxnum.f32(float %a, float %b)
   %sminmax = call float @llvm.minnum.f32(float %smax, float %c)
   store float %sminmax, ptr addrspace(1) %out
@@ -222,27 +370,56 @@ define amdgpu_ps float @test_minmax_commuted_f32_ieee_false(float %a, float %b,
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_maxmin_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: test_minmax_commuted_f32_ieee_false:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_maxmin_num_f32 v0, v0, v1, v2
+; GFX12-NEXT:    ; return to shader part epilog
   %max = call float @llvm.maxnum.f32(float %a, float %b)
   %minmax = call float @llvm.minnum.f32(float %c, float %max)
   ret float %minmax
 }
 
 define float @test_maxmin_f32_ieee_true(float %a, float %b, float %c) {
-; SDAG-LABEL: test_maxmin_f32_ieee_true:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
-; SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
-; SDAG-NEXT:    v_minmax_f32 v0, v0, v1, v2
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: test_maxmin_f32_ieee_true:
+; SDAG-GFX11:       ; %bb.0:
+; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
+; SDAG-GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; SDAG-GFX11-NEXT:    v_minmax_f32 v0, v0, v1, v2
+; SDAG-GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GISEL-LABEL: test_maxmin_f32_ieee_true:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GISEL-NEXT:    v_minmax_f32 v0, v0, v1, v2
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GISEL-GFX11-LABEL: test_maxmin_f32_ieee_true:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GISEL-GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; GISEL-GFX11-NEXT:    v_minmax_f32 v0, v0, v1, v2
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-LABEL: test_maxmin_f32_ieee_true:
+; SDAG-GFX12:       ; %bb.0:
+; SDAG-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_expcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_samplecnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_kmcnt 0x0
+; SDAG-GFX12-NEXT:    v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0
+; SDAG-GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
+; SDAG-GFX12-NEXT:    v_minmax_num_f32 v0, v0, v1, v2
+; SDAG-GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-LABEL: test_maxmin_f32_ieee_true:
+; GISEL-GFX12:       ; %bb.0:
+; GISEL-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_expcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_samplecnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
+; GISEL-GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
+; GISEL-GFX12-NEXT:    v_minmax_num_f32 v0, v0, v1, v2
+; GISEL-GFX12-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float %a, float %b)
   %maxmin = call float @llvm.maxnum.f32(float %min, float %c)
   ret float %maxmin
@@ -253,6 +430,11 @@ define amdgpu_ps float @test_maxmin_commuted_f32_ieee_false(float %a, float %b,
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_minmax_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: test_maxmin_commuted_f32_ieee_false:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_minmax_num_f32 v0, v0, v1, v2
+; GFX12-NEXT:    ; return to shader part epilog
   %min = call float @llvm.minnum.f32(float %a, float %b)
   %maxmin = call float @llvm.maxnum.f32(float %c, float %min)
   ret float %maxmin
@@ -265,6 +447,17 @@ define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z)
 ; GFX11-NEXT:    v_med3_f32 v2, v2, v3, v4
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_med3_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_med3_num_f32 v2, v2, v3, v4
+; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call float @llvm.minnum.f32(float %x, float %y)
   %tmp1 = call float @llvm.maxnum.f32(float %x, float %y)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %z)
@@ -278,29 +471,54 @@ define amdgpu_ps half @test_minmax_f16_ieee_false(half %a, half %b, half %c) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_maxmin_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: test_minmax_f16_ieee_false:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_maxmin_num_f16 v0, v0, v1, v2
+; GFX12-NEXT:    ; return to shader part epilog
   %max = call half @llvm.maxnum.f16(half %a, half %b)
   %minmax = call half @llvm.minnum.f16(half %max, half %c)
   ret half %minmax
 }
 
 define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b, half inreg %c, ptr addrspace(1) inreg %out) {
-; SDAG-LABEL: s_test_minmax_f16_ieee_false:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; SDAG-NEXT:    s_mov_b32 s5, s4
-; SDAG-NEXT:    s_mov_b32 s4, s3
-; SDAG-NEXT:    v_maxmin_f16 v0, s0, s1, v0
-; SDAG-NEXT:    global_store_b16 v1, v0, s[4:5]
-; SDAG-NEXT:    s_endpgm
+; SDAG-GFX11-LABEL: s_test_minmax_f16_ieee_false:
+; SDAG-GFX11:       ; %bb.0:
+; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX11-NEXT:    s_mov_b32 s5, s4
+; SDAG-GFX11-NEXT:    s_mov_b32 s4, s3
+; SDAG-GFX11-NEXT:    v_maxmin_f16 v0, s0, s1, v0
+; SDAG-GFX11-NEXT:    global_store_b16 v1, v0, s[4:5]
+; SDAG-GFX11-NEXT:    s_endpgm
 ;
-; GISEL-LABEL: s_test_minmax_f16_ieee_false:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; GISEL-NEXT:    s_mov_b32 s6, s3
-; GISEL-NEXT:    s_mov_b32 s7, s4
-; GISEL-NEXT:    v_maxmin_f16 v0, s0, s1, v0
-; GISEL-NEXT:    global_store_b16 v1, v0, s[6:7]
-; GISEL-NEXT:    s_endpgm
+; GISEL-GFX11-LABEL: s_test_minmax_f16_ieee_false:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GISEL-GFX11-NEXT:    s_mov_b32 s6, s3
+; GISEL-GFX11-NEXT:    s_mov_b32 s7, s4
+; GISEL-GFX11-NEXT:    v_maxmin_f16 v0, s0, s1, v0
+; GISEL-GFX11-NEXT:    global_store_b16 v1, v0, s[6:7]
+; GISEL-GFX11-NEXT:    s_endpgm
+;
+; SDAG-GFX12-LABEL: s_test_minmax_f16_ieee_false:
+; SDAG-GFX12:       ; %bb.0:
+; SDAG-GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX12-NEXT:    s_mov_b32 s5, s4
+; SDAG-GFX12-NEXT:    s_mov_b32 s4, s3
+; SDAG-GFX12-NEXT:    v_maxmin_num_f16 v0, s0, s1, v0
+; SDAG-GFX12-NEXT:    global_store_b16 v1, v0, s[4:5]
+; SDAG-GFX12-NEXT:    s_endpgm
+;
+; GISEL-GFX12-LABEL: s_test_minmax_f16_ieee_false:
+; GISEL-GFX12:       ; %bb.0:
+; GISEL-GFX12-NEXT:    s_max_num_f16 s0, s0, s1
+; GISEL-GFX12-NEXT:    s_mov_b32 s6, s3
+; GISEL-GFX12-NEXT:    s_mov_b32 s7, s4
+; GISEL-GFX12-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-GFX12-NEXT:    s_min_num_f16 s0, s0, s2
+; GISEL-GFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX12-NEXT:    global_store_b16 v1, v0, s[6:7]
+; GISEL-GFX12-NEXT:    s_endpgm
   %smax = call half @llvm.maxnum.f16(half %a, half %b)
   %sminmax = call half @llvm.minnum.f16(half %smax, half %c)
   store half %sminmax, ptr addrspace(1) %out
@@ -308,23 +526,49 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b
 }
 
 define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) {
-; SDAG-LABEL: test_minmax_commuted_f16_ieee_true:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; SDAG-NEXT:    v_max_f16_e32 v2, v2, v2
-; SDAG-NEXT:    v_maxmin_f16 v0, v0, v1, v2
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_minmax_commuted_f16_ieee_true:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GISEL-NEXT:    v_max_f16_e32 v2, v2, v2
-; GISEL-NEXT:    v_maxmin_f16 v0, v0, v1, v2
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: test_minmax_commuted_f16_ieee_true:
+; SDAG-GFX11:       ; %bb.0:
+; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
+; SDAG-GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
+; SDAG-GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
+; SDAG-GFX11-NEXT:    v_maxmin_f16 v0, v0, v1, v2
+; SDAG-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-LABEL: test_minmax_commuted_f16_ieee_true:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
+; GISEL-GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
+; GISEL-GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
+; GISEL-GFX11-NEXT:    v_maxmin_f16 v0, v0, v1, v2
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-LABEL: test_minmax_commuted_f16_ieee_true:
+; SDAG-GFX12:       ; %bb.0:
+; SDAG-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_expcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_samplecnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_kmcnt 0x0
+; SDAG-GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; SDAG-GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; SDAG-GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
+; SDAG-GFX12-NEXT:    v_maxmin_num_f16 v0, v0, v1, v2
+; SDAG-GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-LABEL: test_minmax_commuted_f16_ieee_true:
+; GISEL-GFX12:       ; %bb.0:
+; GISEL-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_expcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_samplecnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GISEL-GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GISEL-GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
+; GISEL-GFX12-NEXT:    v_maxmin_num_f16 v0, v0, v1, v2
+; GISEL-GFX12-NEXT:    s_setpc_b64 s[30:31]
   %max = call half @llvm.maxnum.f16(half %a, half %b)
   %minmax = call half @llvm.minnum.f16(half %c, half %max)
   ret half %minmax
@@ -335,29 +579,60 @@ define amdgpu_ps half @test_maxmin_f16_ieee_false(half %a, half %b, half %c) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_minmax_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: test_maxmin_f16_ieee_false:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_minmax_num_f16 v0, v0, v1, v2
+; GFX12-NEXT:    ; return to shader part epilog
   %min = call half @llvm.minnum.f16(half %a, half %b)
   %maxmin = call half @llvm.maxnum.f16(half %min, half %c)
   ret half %maxmin
 }
 
 define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) {
-; SDAG-LABEL: test_maxmin_commuted_f16_ieee_true:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; SDAG-NEXT:    v_max_f16_e32 v2, v2, v2
-; SDAG-NEXT:    v_minmax_f16 v0, v0, v1, v2
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_maxmin_commuted_f16_ieee_true:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GISEL-NEXT:    v_max_f16_e32 v2, v2, v2
-; GISEL-NEXT:    v_minmax_f16 v0, v0, v1, v2
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: test_maxmin_commuted_f16_ieee_true:
+; SDAG-GFX11:       ; %bb.0:
+; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
+; SDAG-GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
+; SDAG-GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
+; SDAG-GFX11-NEXT:    v_minmax_f16 v0, v0, v1, v2
+; SDAG-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-LABEL: test_maxmin_commuted_f16_ieee_true:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
+; GISEL-GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
+; GISEL-GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
+; GISEL-GFX11-NEXT:    v_minmax_f16 v0, v0, v1, v2
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-LABEL: test_maxmin_commuted_f16_ieee_true:
+; SDAG-GFX12:       ; %bb.0:
+; SDAG-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_expcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_samplecnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_kmcnt 0x0
+; SDAG-GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; SDAG-GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; SDAG-GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
+; SDAG-GFX12-NEXT:    v_minmax_num_f16 v0, v0, v1, v2
+; SDAG-GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-LABEL: test_maxmin_commuted_f16_ieee_true:
+; GISEL-GFX12:       ; %bb.0:
+; GISEL-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_expcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_samplecnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GISEL-GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GISEL-GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
+; GISEL-GFX12-NEXT:    v_minmax_num_f16 v0, v0, v1, v2
+; GISEL-GFX12-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half %a, half %b)
   %maxmin = call half @llvm.maxnum.f16(half %c, half %min)
   ret half %maxmin
@@ -370,6 +645,17 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0
 ; GFX11-NEXT:    v_med3_f16 v2, v2, v3, v4
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_med3_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_med3_num_f16 v2, v2, v3, v4
+; GFX12-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call half @llvm.minnum.f16(half %x, half %y)
   %tmp1 = call half @llvm.maxnum.f16(half %x, half %y)
   %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %z)
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
new file mode 100644
index 0000000..a9b8663
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 %s -o - | FileCheck %s --check-prefixes=GFX942
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 %s -o - | FileCheck %s --check-prefixes=GFX908
+
+define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
+; GFX942-LABEL: matmul_kernel:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_mov_b32 s2, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v1
+; GFX942-NEXT:    s_mov_b32 s3, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
+; GFX942-NEXT:    s_branch .LBB0_2
+; GFX942-NEXT:  .LBB0_1: ; %bb2
+; GFX942-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; GFX942-NEXT:    s_or_b32 s4, s3, 1
+; GFX942-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX942-NEXT:    s_mov_b32 s3, s2
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX942-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT:    s_and_b32 s3, s5, s4
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[4:5], a[0:3]
+; GFX942-NEXT:    s_cbranch_execz .LBB0_4
+; GFX942-NEXT:  .LBB0_2: ; %bb
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_vccz .LBB0_1
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    ; implicit-def: $sgpr3
+; GFX942-NEXT:  .LBB0_4: ; %common.ret
+; GFX942-NEXT:    s_endpgm
+;
+; GFX908-LABEL: matmul_kernel:
+; GFX908:       ; %bb.0: ; %entry
+; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX908-NEXT:    v_mov_b32_e32 v1, 0
+; GFX908-NEXT:    s_mov_b32 s2, 0
+; GFX908-NEXT:    s_mov_b32 s3, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a0, v1
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX908-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX908-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX908-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
+; GFX908-NEXT:    s_branch .LBB0_2
+; GFX908-NEXT:  .LBB0_1: ; %bb2
+; GFX908-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; GFX908-NEXT:    s_or_b32 s4, s3, 1
+; GFX908-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX908-NEXT:    s_mov_b32 s3, s2
+; GFX908-NEXT:    s_nop 3
+; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT:    v_mov_b32_e32 v5, s3
+; GFX908-NEXT:    v_mov_b32_e32 v4, s2
+; GFX908-NEXT:    v_mov_b32_e32 v2, v1
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT:    s_and_b32 s3, s5, s4
+; GFX908-NEXT:    v_mfma_f32_16x16x16f16 a[0:3], v[4:5], v[4:5], a[0:3]
+; GFX908-NEXT:    s_cbranch_execz .LBB0_4
+; GFX908-NEXT:  .LBB0_2: ; %bb
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX908-NEXT:    s_cbranch_vccz .LBB0_1
+; GFX908-NEXT:  ; %bb.3:
+; GFX908-NEXT:    ; implicit-def: $sgpr3
+; GFX908-NEXT:  .LBB0_4: ; %common.ret
+; GFX908-NEXT:    s_endpgm
+entry:
+  br label %bb
+
+bb:
+  %i = phi { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } [ %i10, %bb2 ], [ zeroinitializer, %entry ]
+  %i1 = phi i32 [ %i5, %bb2 ], [ 0, %entry ]
+  %c0 = icmp ne i32 %a0, 0
+  br i1 %c0, label %bb2, label %bb11
+
+bb2:
+  %i3 = or i32 %i1, 1
+  %i4 = icmp slt i32 %i1, 0
+  %i5 = select i1 %i4, i32 %i3, i32 0
+  %i6 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %i, 123
+  %i7 = insertelement <4 x float> zeroinitializer, float %i6, i32 0
+  %i8 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %i7, i32 0, i32 0, i32 0)
+  %i9 = extractelement <4 x float> %i8, i32 0
+  %i10 = insertvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } zeroinitializer, float %i9, 123
+  br label %bb
+
+bb11:
+  %c1 = icmp ne i32 %a1, 0
+  br i1 %c1, label %bb12, label %common.ret
+
+common.ret:
+  ret void
+
+bb12:
+  %i13 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %i, 0
+  %i14 = insertelement <4 x float> zeroinitializer, float %i13, i32 0
+  %i15 = insertelement <4 x float> %i14, float 0.000000e+00, i32 0
+  %i16 = insertelement <4 x float> %i15, float 0.000000e+00, i32 0
+  br label %common.ret
+}
+
+; Function Attrs: convergent nocallback nofree nosync nounwind willreturn memory(none)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
new file mode 100644
index 0000000..5c83170
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
@@ -0,0 +1,235 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -run-pass si-fold-operands %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=COALESCE
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=GFX908-COALESCE
+
+...
+---
+name:            test
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: test
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; CHECK-NEXT:   S_BITCMP1_B32 killed [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+  ; CHECK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:agpr_32 = COPY [[V_MOV_B32_e32_]]
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 [[V_CNDMASK_B32_e64_]], 1, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:agpr_32 = PHI [[COPY1]], %bb.0, %24, %bb.3
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %11, %bb.3
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[PHI]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:agpr_32 = COPY [[PHI]]
+  ; CHECK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def $scc
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.3, implicit $vcc
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI1]], 1, implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[PHI1]], 31, implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_ASHR_I32_]], killed [[S_OR_B32_]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3
+  ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:areg_128_align2 = COPY [[REG_SEQUENCE]]
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY4]], [[COPY4]], killed [[COPY5]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, [[S_AND_B32_]], %bb.2
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:agpr_32 = PHI [[COPY3]], %bb.1, [[V_MFMA_F32_16X16X16F16_e64_]].sub0, %bb.2
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.1, [[S_MOV_B64_1]], %bb.2
+  ; CHECK-NEXT:   [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[PHI4]], implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 [[V_CNDMASK_B32_e64_1]], 1, implicit $exec
+  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_1]], implicit-def $scc
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+  ; CHECK-NEXT:   S_BRANCH %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  ;
+  ; COALESCE-LABEL: name: test
+  ; COALESCE: bb.0:
+  ; COALESCE-NEXT:   successors: %bb.1(0x80000000)
+  ; COALESCE-NEXT:   liveins: $sgpr4_sgpr5
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; COALESCE-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; COALESCE-NEXT:   S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+  ; COALESCE-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 0
+  ; COALESCE-NEXT:   [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit killed $scc
+  ; COALESCE-NEXT:   undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_MOV_B32_e32 0, implicit $exec
+  ; COALESCE-NEXT:   undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; COALESCE-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+  ; COALESCE-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec
+  ; COALESCE-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.1:
+  ; COALESCE-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+  ; COALESCE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
+  ; COALESCE-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
+  ; COALESCE-NEXT:   S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc
+  ; COALESCE-NEXT:   S_BRANCH %bb.2
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.2:
+  ; COALESCE-NEXT:   successors: %bb.3(0x80000000)
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], 1, implicit-def dead $scc
+  ; COALESCE-NEXT:   [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc
+  ; COALESCE-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc
+  ; COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub2:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1
+  ; COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub3:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1
+  ; COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0
+  ; COALESCE-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]]
+  ; COALESCE-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[V_MOV_B32_e32_]]
+  ; COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; COALESCE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.3:
+  ; COALESCE-NEXT:   successors: %bb.4(0x40000000), %bb.1(0x40000000)
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_MOV_B64_]], implicit $exec
+  ; COALESCE-NEXT:   [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit $exec
+  ; COALESCE-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
+  ; COALESCE-NEXT:   S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+  ; COALESCE-NEXT:   S_BRANCH %bb.4
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.4:
+  ; COALESCE-NEXT:   successors: %bb.5(0x80000000)
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.5:
+  ; COALESCE-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX908-COALESCE-LABEL: name: test
+  ; GFX908-COALESCE: bb.0:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX908-COALESCE-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; GFX908-COALESCE-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; GFX908-COALESCE-NEXT:   S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+  ; GFX908-COALESCE-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 0
+  ; GFX908-COALESCE-NEXT:   [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit killed $scc
+  ; GFX908-COALESCE-NEXT:   undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX908-COALESCE-NEXT:   undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.1:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
+  ; GFX908-COALESCE-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
+  ; GFX908-COALESCE-NEXT:   S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc
+  ; GFX908-COALESCE-NEXT:   S_BRANCH %bb.2
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.2:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], 1, implicit-def dead $scc
+  ; GFX908-COALESCE-NEXT:   [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc
+  ; GFX908-COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub2:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1
+  ; GFX908-COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub3:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0
+  ; GFX908-COALESCE-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]]
+  ; GFX908-COALESCE-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[V_MOV_B32_e32_]]
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.3:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.4(0x40000000), %bb.1(0x40000000)
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_MOV_B64_]], implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit $exec
+  ; GFX908-COALESCE-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
+  ; GFX908-COALESCE-NEXT:   S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+  ; GFX908-COALESCE-NEXT:   S_BRANCH %bb.4
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.4:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.5(0x80000000)
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.5:
+  ; GFX908-COALESCE-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1
+    liveins: $sgpr4_sgpr5
+
+    %0:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0(p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+    S_BITCMP1_B32 killed %1, 0, implicit-def $scc
+    %2:sgpr_32 = S_MOV_B32 0
+    %3:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %5:sreg_32 = IMPLICIT_DEF
+    %6:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %3, implicit $exec
+    %7:sreg_64_xexec = V_CMP_NE_U32_e64 %6, 1, implicit $exec
+
+  bb.1:
+    successors: %bb.2, %bb.3
+
+    %8:vgpr_32 = PHI %4, %bb.0, %9, %bb.3
+    %10:sreg_32 = PHI %2, %bb.0, %11, %bb.3
+    %12:agpr_32 = COPY %8
+    %13:sreg_64 = S_MOV_B64 -1
+    $vcc = S_AND_B64 $exec, %7, implicit-def $scc
+    S_CBRANCH_VCCNZ %bb.3, implicit $vcc
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3
+
+    %14:sreg_32 = S_OR_B32 %10, 1, implicit-def dead $scc
+    %15:sreg_32 = S_ASHR_I32 %10, 31, implicit-def dead $scc
+    %16:sreg_32 = S_AND_B32 killed %15, killed %14, implicit-def dead $scc
+    %17:vreg_128_align2 = REG_SEQUENCE %8, %subreg.sub0, %4, %subreg.sub1, %4, %subreg.sub2, %4, %subreg.sub3
+    %18:sreg_64 = REG_SEQUENCE %2, %subreg.sub0, %2, %subreg.sub1
+    %19:vreg_64_align2 = COPY %18
+    %20:areg_128_align2 = COPY %17
+    %21:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %19, %19, killed %20, 0, 0, 0, implicit $mode, implicit $exec
+    %22:vgpr_32 = COPY %21.sub0
+    %23:sreg_64 = S_MOV_B64 0
+
+  bb.3:
+    successors: %bb.4, %bb.1
+
+    %11:sreg_32 = PHI %5, %bb.1, %16, %bb.2
+    %24:agpr_32 = PHI %12, %bb.1, %21.sub0, %bb.2
+    %25:sreg_64_xexec = PHI %13, %bb.1, %23, %bb.2
+    %9:vgpr_32 = COPY %24
+    %26:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %25, implicit $exec
+    %27:sreg_64_xexec = V_CMP_NE_U32_e64 %26, 1, implicit $exec
+    $vcc = S_AND_B64 $exec, %27, implicit-def $scc
+    S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+    S_BRANCH %bb.4
+
+  bb.4:
+    successors: %bb.5
+
+  bb.5:
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir
new file mode 100644
index 0000000..49c0aaf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir
@@ -0,0 +1,182 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -run-pass si-fold-operands %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=COALESCE
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=GFX908-COALESCE
+
+...
+---
+name:            test
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: test
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; CHECK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   S_BITCMP0_B32 killed [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+  ; CHECK-NEXT:   S_CBRANCH_SCC0 %bb.2, implicit $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_3:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_4:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:areg_128_align2 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_2]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_3]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_4]], %subreg.sub3
+  ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], killed [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], killed [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_3:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], killed [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MFMA_F32_16X16X16F16_e64_3]].sub0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:agpr_32 = PHI [[V_ACCVGPR_WRITE_B32_e64_]], %bb.1, [[V_MFMA_F32_16X16X16F16_e64_3]].sub0, %bb.2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[PHI]]
+  ; CHECK-NEXT:   [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_]], 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_PACK_B32_F16_e64_]], %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
+  ; CHECK-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+  ; CHECK-NEXT:   BUFFER_STORE_DWORDX2_OFFSET_exact [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8)
+  ; CHECK-NEXT:   S_ENDPGM 0
+  ;
+  ; COALESCE-LABEL: name: test
+  ; COALESCE: bb.0:
+  ; COALESCE-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; COALESCE-NEXT:   liveins: $sgpr4_sgpr5
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; COALESCE-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; COALESCE-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0
+  ; COALESCE-NEXT:   S_BITCMP0_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+  ; COALESCE-NEXT:   S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.1:
+  ; COALESCE-NEXT:   successors: %bb.3(0x80000000)
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; COALESCE-NEXT:   S_BRANCH %bb.3
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.2:
+  ; COALESCE-NEXT:   successors: %bb.3(0x80000000)
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; COALESCE-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1
+  ; COALESCE-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; COALESCE-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+  ; COALESCE-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.3:
+  ; COALESCE-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+  ; COALESCE-NEXT:   [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY2]], implicit $mode, implicit $exec
+  ; COALESCE-NEXT:   undef [[V_PACK_B32_F16_e64_:%[0-9]+]].sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, [[V_CVT_F16_F32_e32_]], 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; COALESCE-NEXT:   [[V_PACK_B32_F16_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec
+  ; COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; COALESCE-NEXT:   BUFFER_STORE_DWORDX2_OFFSET_exact [[V_PACK_B32_F16_e64_]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8)
+  ; COALESCE-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX908-COALESCE-LABEL: name: test
+  ; GFX908-COALESCE: bb.0:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; GFX908-COALESCE-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; GFX908-COALESCE-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; GFX908-COALESCE-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0
+  ; GFX908-COALESCE-NEXT:   S_BITCMP0_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+  ; GFX908-COALESCE-NEXT:   S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.1:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; GFX908-COALESCE-NEXT:   S_BRANCH %bb.3
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.2:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   undef [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub1:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; GFX908-COALESCE-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1
+  ; GFX908-COALESCE-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.3:
+  ; GFX908-COALESCE-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+  ; GFX908-COALESCE-NEXT:   [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY2]], implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT:   undef [[V_PACK_B32_F16_e64_:%[0-9]+]].sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, [[V_CVT_F16_F32_e32_]], 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_PACK_B32_F16_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; GFX908-COALESCE-NEXT:   BUFFER_STORE_DWORDX2_OFFSET_exact [[V_PACK_B32_F16_e64_]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8)
+  ; GFX908-COALESCE-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.2, %bb.1
+    liveins: $sgpr4_sgpr5
+
+    %0:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0(p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+    %2:sgpr_32 = S_MOV_B32 0
+    S_BITCMP0_B32 killed %1, 0, implicit-def $scc
+    S_CBRANCH_SCC0 %bb.2, implicit $scc
+
+  bb.1:
+    successors: %bb.3
+
+    %3:sgpr_32 = COPY %2
+    %4:vgpr_32 = COPY %3, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.2:
+    successors: %bb.3
+
+    %5:sgpr_32 = S_MOV_B32 0
+    %6:vgpr_32 = COPY %5
+    %7:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec
+    %8:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec
+    %9:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec
+    %10:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec
+    %11:areg_128_align2 = REG_SEQUENCE %7, %subreg.sub0, %8, %subreg.sub1, %9, %subreg.sub2, %10, %subreg.sub3
+    %12:sreg_64 = REG_SEQUENCE %5, %subreg.sub0, %5, %subreg.sub1
+    %13:vreg_64_align2 = COPY %12
+    %14:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %11, 0, 0, 0, implicit $mode, implicit $exec
+    %15:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %14, 0, 0, 0, implicit $mode, implicit $exec
+    %16:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %15, 0, 0, 0, implicit $mode, implicit $exec
+    %17:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %16, 0, 0, 0, implicit $mode, implicit $exec
+    %18:vgpr_32 = COPY %17.sub0
+    %19:vgpr_32 = COPY %18
+
+  bb.3:
+    %20:vgpr_32 = PHI %4, %bb.1, %19, %bb.2
+    %21:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, %20, 0, 0, implicit $mode, implicit $exec
+    %22:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, %2, 0, 0, implicit $mode, implicit $exec
+    %23:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %24:vreg_64_align2 = REG_SEQUENCE %22, %subreg.sub0, killed %23, %subreg.sub1
+    %25:sgpr_128 = REG_SEQUENCE %2, %subreg.sub0, %2, %subreg.sub1, %2, %subreg.sub2, %2, %subreg.sub3
+    %26:vreg_64_align2 = COPY %24
+    BUFFER_STORE_DWORDX2_OFFSET_exact killed %26, killed %25, %2, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8)
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 98d5f30..a2a0107 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -1372,20 +1372,19 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0xf000
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v3
+; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0xf800
-; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[3:4]
-; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
+; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[3:4]
+; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[7:8]
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v3
 ; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10]
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 1, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v4
 ; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4]
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v7
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v6, v8, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v7, v5
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v8, v6, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v9, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v10, v5, vcc
@@ -1416,32 +1415,32 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff8000, v1
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff8000, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s35
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v12
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v10
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, 0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 1, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[4:5], off offset:-4096
 ; GFX9-NEXT:    s_movk_i32 s0, 0xf000
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[4:5], off
-; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off offset:2048
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:2048
+; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v7, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v9, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v12, v[0:1], s[34:35]
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_store_dwordx2 v10, v[0:1], s[34:35]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: Offset64:
@@ -1477,8 +1476,7 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
 ; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:-2048
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 1, v1, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
 ; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off
@@ -1517,25 +1515,25 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 1, v1, vcc_lo
-; GFX11-NEXT:    global_load_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0xfffff000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    global_load_b64 v[6:7], v[4:5], off offset:-4096
-; GFX11-NEXT:    global_load_b64 v[4:5], v[4:5], off
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048
+; GFX11-NEXT:    global_load_b64 v[4:5], v[2:3], off
+; GFX11-NEXT:    global_load_b64 v[6:7], v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:2048
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 1, v1
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v7, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v4, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[34:35]
 ; GFX11-NEXT:    s_endpgm
 entry:
@@ -2408,18 +2406,17 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0
+; GFX8-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x800
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v6, vcc
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, -1, v6, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4]
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, -1, v0, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v0
 ; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
+; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v6, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
 ; GFX8-NEXT:    flat_store_dwordx2 v[1:2], v[3:4]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -2450,14 +2447,13 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0x1000, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:-2048
+; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v5, vcc
@@ -2490,15 +2486,14 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v8
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v2, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v4, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x800, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, -1, v1
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v4
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo
@@ -2525,19 +2520,18 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
 ; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v4
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, v1, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v2, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v3
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v5, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0, v3
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v5, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, -1, v1
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:-2048
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:-2048
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[34:35]
 ; GFX11-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
index ba428df..a439f8d 100644
--- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
+++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
@@ -3,32 +3,17 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
 
 define amdgpu_cs float @v_s_exp_f32(float inreg %src) {
-; GFX12-SDAG-LABEL: v_s_exp_f32:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_cmp_lt_f32 s0, 0xc2fc0000
-; GFX12-SDAG-NEXT:    s_cselect_b32 s1, 0x42800000, 0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-SDAG-NEXT:    s_add_f32 s0, s0, s1
-; GFX12-SDAG-NEXT:    s_cselect_b32 s1, 0xffffffc0, 0
-; GFX12-SDAG-NEXT:    v_s_exp_f32 s0, s0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(TRANS32_DEP_1)
-; GFX12-SDAG-NEXT:    v_ldexp_f32 v0, s0, s1
-; GFX12-SDAG-NEXT:    ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: v_s_exp_f32:
-; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_cmp_lt_f32 s0, 0xc2fc0000
-; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x42800000, 0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-GISEL-NEXT:    s_add_f32 s0, s0, s1
-; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x1f800000, 1.0
-; GFX12-GISEL-NEXT:    v_s_exp_f32 s0, s0
-; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-GISEL-NEXT:    s_mul_f32 s0, s0, s1
-; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT:    ; return to shader part epilog
+; GFX12-LABEL: v_s_exp_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_cmp_lt_f32 s0, 0xc2fc0000
+; GFX12-NEXT:    s_cselect_b32 s1, 0x42800000, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-NEXT:    s_add_f32 s0, s0, s1
+; GFX12-NEXT:    s_cselect_b32 s1, 0xffffffc0, 0
+; GFX12-NEXT:    v_s_exp_f32 s0, s0
+; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-NEXT:    v_ldexp_f32 v0, s0, s1
+; GFX12-NEXT:    ; return to shader part epilog
   %result = call float @llvm.exp2.f32(float %src)
   ret float %result
 }
@@ -88,16 +73,16 @@ define amdgpu_cs float @v_s_log_f32(float inreg %src) {
 ; GFX12-GISEL-LABEL: v_s_log_f32:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_cmp_lt_f32 s0, 0x800000
-; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x4f800000, 1.0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-GISEL-NEXT:    s_mul_f32 s0, s0, s1
-; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x42000000, 0
-; GFX12-GISEL-NEXT:    v_s_log_f32 s0, s0
-; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-GISEL-NEXT:    s_sub_f32 s0, s0, s1
+; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    s_lshl_b32 s2, s1, 5
+; GFX12-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12-GISEL-NEXT:    v_ldexp_f32 v0, s0, s2
+; GFX12-GISEL-NEXT:    s_cselect_b32 s0, 0x42000000, 0
+; GFX12-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT:    v_subrev_f32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %result = call float @llvm.log2.f32(float %src)
   ret float %result
@@ -322,19 +307,18 @@ define amdgpu_cs float @srcmods_abs_f32(float inreg %src) {
 ;
 ; GFX12-GISEL-LABEL: srcmods_abs_f32:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_bitset0_b32 s0, 31
+; GFX12-GISEL-NEXT:    s_and_b32 s1, s0, 0x7fffffff
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    s_cmp_lt_f32 s0, 0x800000
-; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x4f800000, 1.0
-; GFX12-GISEL-NEXT:    s_mul_f32 s0, s0, s1
-; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x42000000, 0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
-; GFX12-GISEL-NEXT:    v_s_log_f32 s0, s0
-; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    s_sub_f32 s0, s0, s1
+; GFX12-GISEL-NEXT:    s_cmp_lt_f32 s1, 0x800000
+; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX12-GISEL-NEXT:    s_lshl_b32 s2, s1, 5
+; GFX12-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12-GISEL-NEXT:    v_ldexp_f32 v0, |s0|, s2
+; GFX12-GISEL-NEXT:    s_cselect_b32 s0, 0x42000000, 0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    v_subrev_f32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %abs = call float @llvm.fabs.f32(float %src)
   %result = call float @llvm.log2.f32(float %abs)
@@ -362,19 +346,18 @@ define amdgpu_cs float @srcmods_neg_f32(float inreg %src) {
 ;
 ; GFX12-GISEL-LABEL: srcmods_neg_f32:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_xor_b32 s0, s0, 0x80000000
+; GFX12-GISEL-NEXT:    s_xor_b32 s1, s0, 0x80000000
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    s_cmp_lt_f32 s0, 0x800000
-; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x4f800000, 1.0
-; GFX12-GISEL-NEXT:    s_mul_f32 s0, s0, s1
-; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x42000000, 0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
-; GFX12-GISEL-NEXT:    v_s_log_f32 s0, s0
-; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    s_sub_f32 s0, s0, s1
+; GFX12-GISEL-NEXT:    s_cmp_lt_f32 s1, 0x800000
+; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX12-GISEL-NEXT:    s_lshl_b32 s2, s1, 5
+; GFX12-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12-GISEL-NEXT:    v_ldexp_f32 v0, -s0, s2
+; GFX12-GISEL-NEXT:    s_cselect_b32 s0, 0x42000000, 0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    v_subrev_f32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %neg = fneg float %src
   %result = call float @llvm.log2.f32(float %neg)
diff --git a/llvm/test/CodeGen/AMDGPU/remat-physreg-copy-subreg-extract-already-live-at-def-issue120970.mir b/llvm/test/CodeGen/AMDGPU/remat-physreg-copy-subreg-extract-already-live-at-def-issue120970.mir
new file mode 100644
index 0000000..3879f6d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat-physreg-copy-subreg-extract-already-live-at-def-issue120970.mir
@@ -0,0 +1,85 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=register-coalescer -o - %s | FileCheck %s
+
+# This used to assert due to trying to rematerialize V_MOV_B64_PSEUDO
+# at copy to $vgpr1. This would assert since this would clobber the
+# live value in $vgpr0.
+
+---
+name: rematerialize_physreg_sub_def_already_live_at_def_assert
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: rematerialize_physreg_sub_def_already_live_at_def_assert
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: $vgpr1 = COPY [[V_MOV_B]].sub1
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit killed $vgpr1
+    %0:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+    %1:vgpr_32 = COPY %0.sub1
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr1 = COPY %1
+    SI_RETURN implicit $vgpr0, implicit killed $vgpr1
+...
+
+# Same as previous, except with an IMPLICIT_DEF
+---
+name: rematerialize_physreg_sub_def_already_live_at_def_assert_implicit_def
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: rematerialize_physreg_sub_def_already_live_at_def_assert_implicit_def
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: $vgpr1 = COPY [[DEF]].sub1
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit killed $vgpr1
+    %0:vreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = COPY %0.sub1
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr1 = COPY %1
+    SI_RETURN implicit $vgpr0, implicit killed $vgpr1
+...
+
+---
+name: rematerialize_physreg_sub_def_no_live_sub_def_0
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: rematerialize_physreg_sub_def_no_live_sub_def_0
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: dead $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 1, implicit $exec, implicit-def $vgpr1
+    ; CHECK-NEXT: SI_RETURN implicit killed $vgpr1
+    %0:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+    %1:vgpr_32 = COPY %0.sub1
+    $vgpr1 = COPY %1
+    SI_RETURN implicit killed $vgpr1
+...
+
+---
+name: rematerialize_physreg_sub_def_no_live_sub_def_1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: rematerialize_physreg_sub_def_no_live_sub_def_1
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: dead $vgpr1_vgpr2 = V_MOV_B64_PSEUDO 1, implicit $exec, implicit-def $vgpr1
+    ; CHECK-NEXT: SI_RETURN implicit killed $vgpr1
+    %0:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+    %1:vgpr_32 = COPY %0.sub0
+    $vgpr1 = COPY %1
+    SI_RETURN implicit killed $vgpr1
+...
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 3e8768c..96dd627 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -1065,100 +1065,37 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_sdiv24_48:
-; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GCN-IR-NEXT:    s_mov_b32 s15, 0
-; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_sext_i32_i16 s1, s1
-; GCN-IR-NEXT:    s_ashr_i64 s[0:1], s[0:1], 24
-; GCN-IR-NEXT:    s_sext_i32_i16 s3, s3
-; GCN-IR-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
-; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 24
-; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[0:1], 16
-; GCN-IR-NEXT:    s_ashr_i32 s0, s1, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
-; GCN-IR-NEXT:    s_mov_b32 s1, s0
-; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[2:3], 16
-; GCN-IR-NEXT:    s_ashr_i32 s2, s3, 31
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[6:7], s[0:1]
-; GCN-IR-NEXT:    s_mov_b32 s3, s2
-; GCN-IR-NEXT:    s_sub_u32 s12, s6, s0
-; GCN-IR-NEXT:    s_subb_u32 s13, s7, s0
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[8:9], s[2:3]
-; GCN-IR-NEXT:    s_sub_u32 s6, s6, s2
-; GCN-IR-NEXT:    s_subb_u32 s7, s7, s2
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[12:13], 0
-; GCN-IR-NEXT:    s_flbit_i32_b64 s14, s[6:7]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
-; GCN-IR-NEXT:    s_flbit_i32_b64 s20, s[12:13]
-; GCN-IR-NEXT:    s_sub_u32 s16, s14, s20
-; GCN-IR-NEXT:    s_subb_u32 s17, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[18:19], s[16:17], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[22:23], s[16:17], 63
-; GCN-IR-NEXT:    s_or_b64 s[18:19], s[10:11], s[18:19]
-; GCN-IR-NEXT:    s_and_b64 s[10:11], s[18:19], exec
-; GCN-IR-NEXT:    s_cselect_b32 s11, 0, s13
-; GCN-IR-NEXT:    s_cselect_b32 s10, 0, s12
-; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s18, s16, 1
-; GCN-IR-NEXT:    s_addc_u32 s19, s17, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[18:19], 0
-; GCN-IR-NEXT:    s_sub_i32 s16, 63, s16
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[12:13], s16
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_4
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[12:13], s18
-; GCN-IR-NEXT:    s_add_u32 s18, s6, -1
-; GCN-IR-NEXT:    s_addc_u32 s19, s7, -1
-; GCN-IR-NEXT:    s_not_b64 s[8:9], s[14:15]
-; GCN-IR-NEXT:    s_add_u32 s12, s8, s20
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
-; GCN-IR-NEXT:    s_mov_b32 s9, 0
-; GCN-IR-NEXT:  .LBB9_3: ; %udiv-do-while
-; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[16:17], 1
-; GCN-IR-NEXT:    s_lshr_b32 s8, s11, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[8:9]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[14:15], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s8, s18, s16
-; GCN-IR-NEXT:    s_subb_u32 s8, s19, s17
-; GCN-IR-NEXT:    s_ashr_i32 s14, s8, 31
-; GCN-IR-NEXT:    s_mov_b32 s15, s14
-; GCN-IR-NEXT:    s_and_b32 s8, s14, 1
-; GCN-IR-NEXT:    s_and_b64 s[14:15], s[14:15], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s16, s16, s14
-; GCN-IR-NEXT:    s_subb_u32 s17, s17, s15
-; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[20:21], s[12:13], 0
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[8:9]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_3
-; GCN-IR-NEXT:  .LBB9_4: ; %Flow4
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[6:7]
-; GCN-IR-NEXT:  .LBB9_5: ; %udiv-end
-; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
-; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
-; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[10:11], s[0:1]
-; GCN-IR-NEXT:    s_sub_u32 s0, s2, s0
-; GCN-IR-NEXT:    s_subb_u32 s1, s3, s1
+; GCN-IR:       ; %bb.0:
+; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-IR-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
-; GCN-IR-NEXT:    s_waitcnt expcnt(0)
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-IR-NEXT:    s_mov_b32 s5, s1
+; GCN-IR-NEXT:    s_sext_i32_i16 s1, s9
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-IR-NEXT:    v_alignbit_b32 v0, s1, v0, 24
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, v0
+; GCN-IR-NEXT:    s_mov_b32 s4, s0
+; GCN-IR-NEXT:    s_sext_i32_i16 s0, s3
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-IR-NEXT:    v_alignbit_b32 v2, s0, v2, 24
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v3, v2
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v4, v1
+; GCN-IR-NEXT:    v_xor_b32_e32 v0, v2, v0
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v0, 1, v0
+; GCN-IR-NEXT:    v_mul_f32_e32 v2, v3, v4
+; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
+; GCN-IR-NEXT:    v_mad_f32 v3, -v2, v1, v3
+; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-IR-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
 ; GCN-IR-NEXT:    s_endpgm
   %1 = ashr i48 %x, 24
   %2 = ashr i48 %y, 24
diff --git a/llvm/test/CodeGen/AMDGPU/smed3.ll b/llvm/test/CodeGen/AMDGPU/smed3.ll
index e0d0ddc..ddf6297 100644
--- a/llvm/test/CodeGen/AMDGPU/smed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/smed3.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
@@ -98,6 +100,8 @@ declare i64 @llvm.smin.i64(i64, i64)
 ; VI: v_max_i16_e32 [[MAX:v[0-9]]], 12, {{v[0-9]}}
 ; VI: v_min_i16_e32 {{v[0-9]}}, 17, [[MAX]]
 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+; GFX11-TRUE16: v_med3_i16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, 12, 17
+; GFX11-FAKE16: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
 define amdgpu_kernel void @v_test_smed3_r_i_i_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
@@ -686,6 +690,8 @@ bb:
 ; VI: v_max_i16
 
 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX11-TRUE16: v_med3_i16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l
+; GFX11-FAKE16: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @v_test_smed3_i16_pat_0(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -707,6 +713,8 @@ bb:
 
 ; GCN-LABEL: {{^}}v_test_smed3_i16_pat_1:
 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX11-TRUE16: v_med3_i16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l
+; GFX11-FAKE16: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 define amdgpu_kernel void @v_test_smed3_i16_pat_1(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 {
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index cb8f82d..23364e8 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1188,109 +1188,39 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_srem24_48:
-; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GCN-IR-NEXT:    s_mov_b32 s13, 0
+; GCN-IR:       ; %bb.0:
+; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_sext_i32_i16 s1, s1
 ; GCN-IR-NEXT:    s_sext_i32_i16 s3, s3
-; GCN-IR-NEXT:    s_ashr_i64 s[0:1], s[0:1], 24
-; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 24
-; GCN-IR-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[2:3], 16
-; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[0:1], 16
-; GCN-IR-NEXT:    s_ashr_i32 s0, s1, 31
-; GCN-IR-NEXT:    s_mov_b32 s1, s0
-; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[6:7], 16
-; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
-; GCN-IR-NEXT:    s_sub_u32 s2, s2, s0
-; GCN-IR-NEXT:    s_subb_u32 s3, s3, s0
-; GCN-IR-NEXT:    s_ashr_i32 s10, s7, 31
-; GCN-IR-NEXT:    s_mov_b32 s11, s10
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[8:9], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s6, s6, s10
-; GCN-IR-NEXT:    s_subb_u32 s7, s7, s10
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
-; GCN-IR-NEXT:    s_flbit_i32_b64 s12, s[6:7]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
-; GCN-IR-NEXT:    s_flbit_i32_b64 s20, s[2:3]
-; GCN-IR-NEXT:    s_sub_u32 s14, s12, s20
-; GCN-IR-NEXT:    s_subb_u32 s15, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[14:15], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[14:15], 63
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[10:11], s[16:17]
-; GCN-IR-NEXT:    s_and_b64 s[10:11], s[16:17], exec
-; GCN-IR-NEXT:    s_cselect_b32 s11, 0, s3
-; GCN-IR-NEXT:    s_cselect_b32 s10, 0, s2
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s16, s14, 1
-; GCN-IR-NEXT:    s_addc_u32 s17, s15, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[16:17], 0
-; GCN-IR-NEXT:    s_sub_i32 s14, 63, s14
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[2:3], s14
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_4
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[2:3], s16
-; GCN-IR-NEXT:    s_add_u32 s18, s6, -1
-; GCN-IR-NEXT:    s_addc_u32 s19, s7, -1
-; GCN-IR-NEXT:    s_not_b64 s[8:9], s[12:13]
-; GCN-IR-NEXT:    s_add_u32 s12, s8, s20
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], 0
-; GCN-IR-NEXT:    s_mov_b32 s9, 0
-; GCN-IR-NEXT:  .LBB9_3: ; %udiv-do-while
-; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_lshr_b32 s8, s11, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[8:9]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s8, s18, s14
-; GCN-IR-NEXT:    s_subb_u32 s8, s19, s15
-; GCN-IR-NEXT:    s_ashr_i32 s16, s8, 31
-; GCN-IR-NEXT:    s_mov_b32 s17, s16
-; GCN-IR-NEXT:    s_and_b32 s8, s16, 1
-; GCN-IR-NEXT:    s_and_b64 s[16:17], s[16:17], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s14, s14, s16
-; GCN-IR-NEXT:    s_subb_u32 s15, s15, s17
-; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[20:21], s[12:13], 0
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], s[8:9]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_3
-; GCN-IR-NEXT:  .LBB9_4: ; %Flow4
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
-; GCN-IR-NEXT:  .LBB9_5: ; %udiv-end
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-IR-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GCN-IR-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x9
-; GCN-IR-NEXT:    s_mul_i32 s4, s6, s11
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
-; GCN-IR-NEXT:    s_mul_i32 s4, s7, s10
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
-; GCN-IR-NEXT:    s_mul_i32 s4, s6, s10
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s4
-; GCN-IR-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
-; GCN-IR-NEXT:    v_subb_u32_e32 v0, vcc, v2, v0, vcc
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, s1, v0
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-IR-NEXT:    v_subrev_i32_e32 v1, vcc, s0, v1
-; GCN-IR-NEXT:    s_mov_b32 s15, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s14, -1
-; GCN-IR-NEXT:    v_subb_u32_e32 v0, vcc, v0, v2, vcc
-; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    buffer_store_short v0, off, s[12:15], 0 offset:4
-; GCN-IR-NEXT:    buffer_store_dword v1, off, s[12:15], 0
+; GCN-IR-NEXT:    s_sext_i32_i16 s5, s5
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-IR-NEXT:    v_alignbit_b32 v0, s5, v0, 24
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-IR-NEXT:    v_alignbit_b32 v2, s3, v2, 24
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v3, v2
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v4, v1
+; GCN-IR-NEXT:    v_xor_b32_e32 v5, v2, v0
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v5, 30, v5
+; GCN-IR-NEXT:    v_or_b32_e32 v5, 1, v5
+; GCN-IR-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GCN-IR-NEXT:    v_trunc_f32_e32 v4, v4
+; GCN-IR-NEXT:    v_mad_f32 v3, -v4, v1, v3
+; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
+; GCN-IR-NEXT:    s_mov_b32 s4, s0
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GCN-IR-NEXT:    s_mov_b32 s5, s1
+; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-IR-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-IR-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
 ; GCN-IR-NEXT:    s_endpgm
   %1 = ashr i48 %x, 24
   %2 = ashr i48 %y, 24
diff --git a/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll b/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll
new file mode 100644
index 0000000..f52f116
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+; Reduce a 64-bit sub by a constant if we know the low 32-bits are all
+; zero.
+
+; sub i64:x, K if computeTrailingZeros(K) >= 32
+; => build_pair (sub x.hi, K.hi), x.lo
+
+define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_0(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 0xfffc0000
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %reg, 1125899906842624 ; (1 << 50)
+  ret i64 %sub
+}
+
+define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_1(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_low_bits_known0_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, -1
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %reg, 4294967296 ; (1 << 32)
+  ret i64 %sub
+}
+
+define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_2(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_low_bits_known0_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, -2
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %reg, 8589934592 ; (1 << 33)
+  ret i64 %sub
+}
+
+define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_3(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_low_bits_known0_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 0x80000000
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %reg, -9223372036854775808 ; (1 << 63)
+  ret i64 %sub
+}
+
+define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_4(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_low_bits_known0_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 1
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %reg, -4294967296 ; 0xffffffff00000000
+  ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_low_bits_known0_0(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 0xfffc0000, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %reg, 1125899906842624 ; (1 << 50)
+  ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_low_bits_known0_1(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_low_bits_known0_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %reg, 4294967296 ; (1 << 32)
+  ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_low_bits_known0_2(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_low_bits_known0_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, -2, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %reg, 8589934592 ; (1 << 33)
+  ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_low_bits_known0_3(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_low_bits_known0_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %reg, -9223372036854775808 ; (1 << 63)
+  ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_low_bits_known0_4(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_low_bits_known0_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %reg, -4294967296 ; 0xffffffff00000000
+  ret i64 %sub
+}
+
+define amdgpu_ps i64 @s_sub_i64_const_high_bits_known0_0(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_u32 s0, s0, 1
+; GFX9-NEXT:    s_addc_u32 s1, s1, -1
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %reg, 4294967295 ; (1 << 31)
+  ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_high_bits_known0_0(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %reg, 4294967295 ; (1 << 31)
+  ret i64 %sub
+}
+
+define <2 x i64> @v_sub_v2i64_splat_const_low_bits_known0_0(<2 x i64> %reg) {
+; GFX9-LABEL: v_sub_v2i64_splat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT:    v_add_u32_e32 v3, -1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
+  ret <2 x i64> %sub
+}
+
+define <2 x i64> @v_sub_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) {
+; GFX9-LABEL: v_sub_v2i64_nonsplat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT:    v_add_u32_e32 v3, -2, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
+  ret <2 x i64> %sub
+}
+
+define amdgpu_ps <2 x i64> @s_sub_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
+; GFX9-LABEL: s_sub_v2i64_splat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, -1
+; GFX9-NEXT:    s_add_i32 s3, s3, -1
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
+  ret <2 x i64> %sub
+}
+
+define amdgpu_ps <2 x i64> @s_sub_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
+; GFX9-LABEL: s_sub_v2i64_nonsplat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, -1
+; GFX9-NEXT:    s_add_i32 s3, s3, -2
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
+  ret <2 x i64> %sub
+}
+
+; We could reduce this to use a 32-bit sub if we use computeKnownBits
+define i64 @v_sub_i64_variable_high_bits_known0_0(i64 %reg, i32 %offset.hi32) {
+; GFX9-LABEL: v_sub_i64_variable_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, 0, v0
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %zext.offset.hi32 = zext i32 %offset.hi32 to i64
+  %in.high.bits = shl i64 %zext.offset.hi32, 32
+  %sub = sub i64 %reg, %in.high.bits
+  ret i64 %sub
+}
+
+; We could reduce this to use a 32-bit sub if we use computeKnownBits
+define amdgpu_ps i64 @s_sub_i64_variable_high_bits_known0_0(i64 inreg %reg, i32 inreg %offset.hi32) {
+; GFX9-LABEL: s_sub_i64_variable_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_sub_u32 s0, s0, 0
+; GFX9-NEXT:    s_subb_u32 s1, s1, s2
+; GFX9-NEXT:    ; return to shader part epilog
+  %zext.offset.hi32 = zext i32 %offset.hi32 to i64
+  %in.high.bits = shl i64 %zext.offset.hi32, 32
+  %sub = sub i64 %reg, %in.high.bits
+  ret i64 %sub
+}
diff --git a/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir b/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir
new file mode 100644
index 0000000..8315708
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir
@@ -0,0 +1,94 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -stress-regalloc=4 -verify-regalloc -start-before=greedy,2 -stop-after=virtregrewriter,2 -o - %s | FileCheck %s
+
+# This testcase hit a situation where greedy would hit a use after
+# free during last chance recoloring. This case successfully allocates
+# after, but is extremely sensitive to the exact allocation ordering.
+
+---
+name:            swdev502267_use_after_free_last_chance_recoloring_alloc_succeeds
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16 }
+machineFunctionInfo:
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  frameOffsetReg:  '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  ; CHECK-LABEL: name: swdev502267_use_after_free_last_chance_recoloring_alloc_succeeds
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
+  ; CHECK-NEXT:   renamable $vgpr6_vgpr7_vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX4 killed renamable $vgpr4_vgpr5, 0, 0, implicit $exec :: (volatile load (s128), addrspace 1)
+  ; CHECK-NEXT:   renamable $vgpr4 = V_FMA_F32_e64 0, $vgpr6, 0, $vgpr6, 0, $vgpr3, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   renamable $vgpr5 = V_FMA_F32_e64 0, $vgpr7, 0, $vgpr7, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   SI_SPILL_AV128_SAVE $vgpr6_vgpr7_vgpr8_vgpr9, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr6 = V_FMA_F32_e64 0, killed $vgpr8, 0, $vgpr8, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   renamable $vgpr7 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0_vgpr1_vgpr2_vgpr3:0x00000000000000FF, $vgpr4_vgpr5_vgpr6_vgpr7:0x00000000000000FF
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   SI_SPILL_AV128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr0 = V_FMA_F32_e64 0, $vgpr6, 0, $vgpr6, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   SI_SPILL_V128_SAVE $vgpr4_vgpr5_vgpr6_vgpr7, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr0 = V_TRUNC_F32_e32 killed $vgpr0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   SI_SPILL_V32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr0 = IMPLICIT_DEF
+  ; CHECK-NEXT:   renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr5 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, killed $vgpr7, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr9 = COPY killed renamable $vgpr5
+  ; CHECK-NEXT:   renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr8 = nofpexcept V_FMA_F32_e64 1, killed $vgpr0, 0, killed $vgpr6, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   renamable $vgpr2_vgpr3 = COPY killed renamable $vgpr8_vgpr9
+  ; CHECK-NEXT:   renamable $vgpr0 = IMPLICIT_DEF
+  ; CHECK-NEXT:   renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr0 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, killed $vgpr4, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   dead renamable $vgpr1 = V_FMA_F32_e64 0, killed $vgpr5, 0, $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load (s128), addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
+  ; CHECK-NEXT:   GLOBAL_STORE_DWORDX4 killed renamable $vgpr4_vgpr5, renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128), addrspace 1)
+  ; CHECK-NEXT:   renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_AV128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   liveins: $vgpr0_vgpr1_vgpr2_vgpr3:0x00000000000000FF
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0_vgpr1_vgpr2_vgpr3
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+
+    %0:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    %1:vreg_64_align2 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %1, 0, 0, implicit $exec :: (volatile load (s128), addrspace 1)
+    undef %4.sub0:vreg_128_align2 = V_FMA_F32_e64 0, %3.sub0, 0, %3.sub0, 0, %0.sub3, 0, 0, implicit $mode, implicit $exec
+    %4.sub1:vreg_128_align2 = V_FMA_F32_e64 0, %3.sub1, 0, %3.sub1, 0, %0.sub2, 0, 0, implicit $mode, implicit $exec
+    %4.sub2:vreg_128_align2 = V_FMA_F32_e64 0, %3.sub2, 0, %3.sub2, 0, %0.sub1, 0, 0, implicit $mode, implicit $exec
+    %4.sub3:vreg_128_align2 = IMPLICIT_DEF
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    %5:vgpr_32 = V_FMA_F32_e64 0, %4.sub2, 0, %4.sub2, 0, %3.sub2, 0, 0, implicit $mode, implicit $exec
+    %6:vgpr_32 = V_TRUNC_F32_e32 %5, implicit $mode, implicit $exec
+    undef %7.sub3:vreg_128_align2 = nofpexcept V_DIV_FIXUP_F32_e64 0, %2, 0, %4.sub3, 0, %3.sub3, 0, 0, implicit $mode, implicit $exec
+    %7.sub2:vreg_128_align2 = nofpexcept V_FMA_F32_e64 1, %6, 0, %4.sub2, 0, %3.sub2, 0, 0, implicit $mode, implicit $exec
+    %7.sub0:vreg_128_align2 = nofpexcept V_DIV_FIXUP_F32_e64 0, %2, 0, %4.sub0, 0, %3.sub0, 0, 0, implicit $mode, implicit $exec
+    %8:vgpr_32 = V_FMA_F32_e64 0, %4.sub1, 0, %4.sub1, 0, %3.sub1, 0, 0, implicit $mode, implicit $exec
+    %9:vreg_128_align2 = SCRATCH_LOAD_DWORDX4_SADDR %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load (s128), addrspace 5)
+    GLOBAL_STORE_DWORDX4 %1, %7, 0, 0, implicit $exec :: (volatile store (s128), addrspace 1)
+
+  bb.2:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %0
+    SI_RETURN implicit $vgpr0_vgpr1_vgpr2_vgpr3
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll b/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll
new file mode 100644
index 0000000..f0b3d33
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll
@@ -0,0 +1,23 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs=0 -O0 2> %t.err < %s | FileCheck %s
+; RUN: FileCheck -check-prefix=ERR %s < %t.err
+
+; FIXME: This error will be fixed by supporting arbitrary divergent
+; dynamic allocas by performing a wave umax of the size.
+
+; ERR: error: <unknown>:0:0: in function move_to_valu_assert_srd_is_physreg_swdev503538 i32 (ptr addrspace(1)): illegal VGPR to SGPR copy
+
+; CHECK: ; illegal copy v0 to s32
+
+define i32 @move_to_valu_assert_srd_is_physreg_swdev503538(ptr addrspace(1) %ptr) {
+entry:
+  %idx = load i32, ptr addrspace(1) %ptr, align 4
+  %zero = extractelement <4 x i32> zeroinitializer, i32 %idx
+  %alloca = alloca [2048 x i8], i32 %zero, align 8, addrspace(5)
+  %ld = load i32, ptr addrspace(5) %alloca, align 8
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 2048, i1 false)
+  ret i32 %ld
+}
+
+declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg) #0
+
+attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: write) }
diff --git a/llvm/test/CodeGen/AMDGPU/umed3.ll b/llvm/test/CodeGen/AMDGPU/umed3.ll
index 557d023..4726e81 100644
--- a/llvm/test/CodeGen/AMDGPU/umed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/umed3.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN  -check-prefix=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
@@ -84,6 +86,8 @@ define amdgpu_kernel void @v_test_umed3_r_i_i_i64(ptr addrspace(1) %out, ptr add
 ; VI: v_max_u16_e32 [[MAX:v[0-9]]], 12, {{v[0-9]}}
 ; VI: v_min_u16_e32 {{v[0-9]}}, 17, [[MAX]]
 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+; GFX11-TRUE16: v_med3_u16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, 12, 17
+; GFX11-FAKE16: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
 define amdgpu_kernel void @v_test_umed3_r_i_i_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
@@ -707,6 +711,8 @@ bb:
 ; VI: v_max_u16
 
 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX11-TRUE16: v_med3_u16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l
+; GFX11-FAKE16: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @v_test_umed3_i16_pat_0(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -728,6 +734,8 @@ bb:
 
 ; GCN-LABEL: {{^}}v_test_umed3_i16_pat_1:
 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX11-TRUE16: v_med3_u16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l
+; GFX11-FAKE16: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @v_test_umed3_i16_pat_1(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir
index ad4ad6d..b663acb 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir
@@ -67,7 +67,7 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: S_WAITCNT 0
     ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; GCN-NEXT: CFI_INSTRUCTION offset $vgpr0_lo16, 16
+    ; GCN-NEXT: CFI_INSTRUCTION offset $vgpr0, 16
     $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
     CFI_INSTRUCTION offset $vgpr0, 16
 
diff --git a/llvm/test/CodeGen/ARM/sink-store-pre-load-dependency.mir b/llvm/test/CodeGen/ARM/sink-store-pre-load-dependency.mir
new file mode 100644
index 0000000..92c983e
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/sink-store-pre-load-dependency.mir
@@ -0,0 +1,41 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -o - %s -mtriple=armv7-- -run-pass=machine-sink | FileCheck %s
+
+name: sink-store-load-dep
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, size: 8, alignment: 8 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sink-store-load-dep
+    ; CHECK:       bb.0:
+    ; CHECK:         [[LDRi12_:%[0-9]+]]:gpr = LDRi12 %stack.0, 0, 14 /* CC::al */, $noreg :: (load (s32))
+    ; CHECK-NEXT:    [[MOVi:%[0-9]+]]:gpr = MOVi 55296, 14 /* CC::al */, $noreg, $noreg
+    ; CHECK-NEXT:    [[ADDri1:%[0-9]+]]:gpr = ADDri [[LDRi12_:%[0-9]+]], 0, 14 /* CC::al */, $noreg, $noreg
+    ; CHECK-NEXT:    [[LDRH:%[0-9]+]]:gpr = LDRH killed [[ADDri1:%[0-9]+]], $noreg, 0, 14 /* CC::al */, $noreg :: (load (s16))
+    ; CHECK-NEXT:    [[MOVi1:%[0-9]+]]:gpr = MOVi 0, 14 /* CC::al */, $noreg, $noreg
+    ; CHECK-NEXT:    early-clobber %5:gpr = STRH_PRE [[MOVi:%[0-9]+]], [[LDRi12_:%[0-9]+]], [[MOVi1:%[0-9]+]], 0, 14 /* CC::al */, $noreg
+    ; CHECK-NEXT:    [[SUBri:%.*]]:gpr = SUBri killed [[LDRi12_:%[0-9]+]], 0, 14 /* CC::al */, $noreg, $noreg
+    ; CHECK:       bb.2:
+    ; CHECK-NEXT:    [[MOVi2:%[0-9]+]]:gpr = MOVi [[LDRH:%[0-9]+]], 14 /* CC::al */, $noreg, $noreg
+    %0:gpr = LDRi12 %stack.0, 0, 14, $noreg :: (load (s32))
+    %1:gpr = MOVi 55296, 14, $noreg, $noreg
+    %2:gpr = ADDri %0:gpr, 0, 14, $noreg, $noreg
+    %3:gpr = LDRH killed %2:gpr, $noreg, 0, 14, $noreg :: (load (s16))
+    %4:gpr = MOVi 0, 14, $noreg, $noreg
+    early-clobber %5:gpr = STRH_PRE %1:gpr, %0:gpr, %4:gpr, 0, 14, $noreg
+    %6:gpr = SUBri killed %0:gpr, 0, 14, $noreg, $noreg
+    CMPri %6:gpr, 0, 14, $noreg, implicit-def $cpsr
+    Bcc %bb.2, 3, $cpsr
+    B %bb.1
+
+  bb.1:
+    %8:gpr = MOVi 0, 14, $noreg, $noreg
+    $r0 = COPY %8:gpr
+    BX_RET 14, $noreg, implicit $r0
+
+  bb.2:
+    %9:gpr = MOVi %3:gpr, 14, $noreg, $noreg
+    $r0 = COPY %9:gpr
+    BX_RET 14, $noreg, implicit $r0
+...
diff --git a/llvm/test/CodeGen/AVR/branch-relaxation-long-backward.ll b/llvm/test/CodeGen/AVR/branch-relaxation-long-backward.ll
new file mode 100644
index 0000000..7c915e1
--- /dev/null
+++ b/llvm/test/CodeGen/AVR/branch-relaxation-long-backward.ll
@@ -0,0 +1,2081 @@
+; RUN: llc < %s -mtriple=avr -mcpu=attiny85 -filetype=obj -o - | llvm-objdump --mcpu=attiny85 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=ATTINY85 %s
+; RUN: not llc < %s -mtriple=avr -mcpu=avr25 -filetype=obj -o - 2>&1 | FileCheck --check-prefix=AVR25 %s
+; RUN: llc < %s -mtriple=avr -mcpu=avr3 -filetype=obj -o - | llvm-objdump --mcpu=avr3 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=AVR3 %s
+
+; ATTINY85: <main>:
+; ATTINY85-NEXT: andi r24, 0x1
+; ATTINY85: cpi r24, 0x0
+; ATTINY85-NEXT: breq .+2
+; ATTINY85-NEXT: rjmp .+4086
+; ATTINY85: ldi r24, 0x3
+; ATTINY85-NEXT: ret
+
+; AVR25: error: out of range branch target (expected an integer in the range -4096 to 4095)
+
+; AVR3: <main>:
+; AVR3-NEXT: andi r24, 0x1
+; AVR3: cpi r24, 0x0
+; AVR3-NEXT: breq .+4
+; AVR3-NEXT: jmp 0x0
+; AVR3-NEXT: R_AVR_CALL .text+0x2
+; AVR3: ldi r24, 0x3
+; AVR3-NEXT: ret
+
+define i8 @main(i1 %a) {
+entry-block:
+  br label %hello
+hello:
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  br i1 %a, label %hello, label %finished
+finished:
+  ret i8 3
+}
diff --git a/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll b/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll
new file mode 100644
index 0000000..24ddb36
--- /dev/null
+++ b/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll
@@ -0,0 +1,2081 @@
+; RUN: llc < %s -mtriple=avr -mcpu=attiny85 -filetype=obj -o - | llvm-objdump --mcpu=attiny85 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=ATTINY85 %s
+; RUN: not llc < %s -mtriple=avr -mcpu=avr25 -filetype=obj -o - 2>&1 | FileCheck --check-prefix=AVR25 %s
+; RUN: llc < %s -mtriple=avr -mcpu=avr3 -filetype=obj -o - | llvm-objdump --mcpu=avr3 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=AVR3 %s
+
+; ATTINY85: <main>:
+; ATTINY85-NEXT: andi r24, 0x1
+; ATTINY85-NEXT: cpi r24, 0x0
+; ATTINY85-NEXT: brne .+2
+; ATTINY85-NEXT: rjmp .-4092
+; ATTINY85: ldi r24, 0x3
+; ATTINY85-NEXT: ret
+
+; AVR25: error: out of range branch target (expected an integer in the range -4096 to 4095)
+
+; AVR3: <main>:
+; AVR3-NEXT: andi r24, 0x1
+; AVR3-NEXT: cpi r24, 0x0
+; AVR3-NEXT: brne .+4
+; AVR3-NEXT: jmp 0x0
+; AVR3-NEXT: R_AVR_CALL .text+0x100e
+; AVR3: ldi r24, 0x3
+; AVR3-NEXT: ret
+
+define i8 @main(i1 %a) {
+entry-block:
+  br i1 %a, label %hello, label %finished
+hello:
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  br label %finished
+finished:
+  ret i8 3
+}
diff --git a/llvm/test/CodeGen/AVR/branch-relaxation-long.ll b/llvm/test/CodeGen/AVR/branch-relaxation-long.ll
deleted file mode 100644
index cd7a804..0000000
--- a/llvm/test/CodeGen/AVR/branch-relaxation-long.ll
+++ /dev/null
@@ -1,4162 +0,0 @@
-; RUN: llc < %s -mtriple=avr -mattr=avr3 | FileCheck %s
-; RUN: llc < %s -mtriple=avr -mattr=avr2 | FileCheck --check-prefix=AVR2 %s
-
-; CHECK-LABEL: relax_to_jmp:
-; CHECK: cpi     r{{[0-9]+}}, 0
-; CHECK: brne    [[BB1:.LBB[0-9]+_[0-9]+]]
-; CHECK: jmp     [[BB2:.LBB[0-9]+_[0-9]+]]
-; CHECK: [[BB1]]:
-; CHECK: nop
-; CHECK: [[BB2]]:
-
-;; A `RJMP` is generated instead of expected `JMP` for AVR2,
-;; and it is up to the linker to report 'out of range' or
-;; 'exceed flash maximum size'.
-; AVR2-LABEL: relax_to_jmp:
-; AVR2: cpi     r{{[0-9]+}}, 0
-; AVR2: brne    [[BB1:.LBB[0-9]+_[0-9]+]]
-; AVR2: rjmp    [[BB2:.LBB[0-9]+_[0-9]+]]
-; AVR2: [[BB1]]:
-; AVR2: nop
-; AVR2: [[BB2]]:
-
-define i8 @relax_to_jmp(i1 %a) {
-entry-block:
-  br i1 %a, label %hello, label %finished
-hello:
-  ; with >4 kB of instructions (2050 NOPs), this requires a long jump (jmp),
-  ; versus a relative one (rjmp).
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  br label %finished
-finished:
-  ret i8 3
-}
-
-; CHECK-LABEL: relax_to_jmp_backwards:
-; CHECK: [[BB1:.LBB[0-9]+_[0-9]+]]
-; CHECK: nop
-; CHECK: cpi     r{{[0-9]+}}, 0
-; CHECK: breq    [[BB2:.LBB[0-9]+_[0-9]+]]
-; CHECK: jmp     [[BB1]]
-; CHECK: [[BB2]]:
-
-;; A `RJMP` is generated instead of expected `JMP` for AVR2,
-;; and it is up to the linker to report 'out of range' or
-;; 'exceed flash maximum size'.
-; AVR2-LABEL: relax_to_jmp_backwards:
-; AVR2: [[BB1:.LBB[0-9]+_[0-9]+]]
-; AVR2: nop
-; AVR2: cpi     r{{[0-9]+}}, 0
-; AVR2: breq    [[BB2:.LBB[0-9]+_[0-9]+]]
-; AVR2: rjmp    [[BB1]]
-; AVR2: [[BB2]]:
-
-define i8 @relax_to_jmp_backwards(i1 %a) {
-entry-block:
-  br label %hello
-hello:
-  ; with >4 kB of instructions (2050 NOPs), this requires a long jump (jmp),
-  ; versus a relative one (rjmp).
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  br i1 %a, label %hello, label %finished
-finished:
-  ret i8 3
-}
diff --git a/llvm/test/CodeGen/DirectX/BufferLoad-sm61.ll b/llvm/test/CodeGen/DirectX/BufferLoad-sm61.ll
new file mode 100644
index 0000000..501f151
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/BufferLoad-sm61.ll
@@ -0,0 +1,60 @@
+; RUN: opt -S -dxil-op-lower %s | FileCheck %s
+; Before SM6.2 ByteAddressBuffer and StructuredBuffer lower to bufferLoad.
+
+target triple = "dxil-pc-shadermodel6.1-compute"
+
+; CHECK-LABEL: define void @loadf32_struct
+define void @loadf32_struct(i32 %index) {
+  %buffer = call target("dx.RawBuffer", float, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle %{{.*}}, i32 %index, i32 0)
+  %load = call {float, i1}
+      @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_f32_0_0_0t(
+          target("dx.RawBuffer", float, 0, 0, 0) %buffer,
+          i32 %index,
+          i32 0)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadv4f32_byte
+define void @loadv4f32_byte(i32 %offset) {
+  %buffer = call target("dx.RawBuffer", i8, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle %{{.*}}, i32 %offset, i32 0)
+  %load = call {<4 x float>, i1}
+      @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_i8_0_0_0t(
+          target("dx.RawBuffer", i8, 0, 0, 0) %buffer,
+          i32 %offset,
+          i32 0)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadnested
+define void @loadnested(i32 %index) {
+  %buffer = call
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATAI32:%.*]] = call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32(i32 68, %dx.types.Handle %{{.*}}, i32 %index, i32 0)
+  %loadi32 = call {i32, i1} @llvm.dx.resource.load.rawbuffer.i32(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 0)
+
+  ; CHECK: [[DATAF32:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle %{{.*}}, i32 %index, i32 4)
+  %loadf32 = call {<4 x float>, i1} @llvm.dx.resource.load.rawbuffer.v4f32(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 4)
+
+  ; CHECK: [[DATAF16:%.*]] = call %dx.types.ResRet.f16 @dx.op.bufferLoad.f16(i32 68, %dx.types.Handle %{{.*}}, i32 %index, i32 20)
+  %loadf16 = call {<3 x half>, i1} @llvm.dx.resource.load.rawbuffer.v3f16(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 20)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/BufferLoad.ll b/llvm/test/CodeGen/DirectX/BufferLoad.ll
index 7f1291b..86e2217 100644
--- a/llvm/test/CodeGen/DirectX/BufferLoad.ll
+++ b/llvm/test/CodeGen/DirectX/BufferLoad.ll
@@ -17,8 +17,9 @@ define void @loadv4f32() {
   ; CHECK-NOT: %dx.resource.casthandle
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call <4 x float> @llvm.dx.resource.load.typedbuffer(
+  %load0 = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 0)
+  %data0 = extractvalue {<4 x float>, i1} %load0, 0
 
   ; The extract order depends on the users, so don't enforce that here.
   ; CHECK-DAG: [[VAL0_0:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA0]], 0
@@ -34,8 +35,9 @@ define void @loadv4f32() {
   call void @scalar_user(float %data0_2)
 
   ; CHECK: [[DATA4:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 4, i32 undef)
-  %data4 = call <4 x float> @llvm.dx.resource.load.typedbuffer(
+  %load4 = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 4)
+  %data4 = extractvalue {<4 x float>, i1} %load4, 0
 
   ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA4]], 0
   ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA4]], 1
@@ -48,8 +50,9 @@ define void @loadv4f32() {
   call void @vector_user(<4 x float> %data4)
 
   ; CHECK: [[DATA12:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 12, i32 undef)
-  %data12 = call <4 x float> @llvm.dx.resource.load.typedbuffer(
+  %load12 = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 12)
+  %data12 = extractvalue {<4 x float>, i1} %load12, 0
 
   ; CHECK: [[DATA12_3:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA12]], 3
   %data12_3 = extractelement <4 x float> %data12, i32 3
@@ -70,8 +73,9 @@ define void @index_dynamic(i32 %bufindex, i32 %elemindex) {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[LOAD:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 %bufindex, i32 undef)
-  %load = call <4 x float> @llvm.dx.resource.load.typedbuffer(
+  %load = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 %bufindex)
+  %data = extractvalue {<4 x float>, i1} %load, 0
 
   ; CHECK: [[ALLOCA:%.*]] = alloca [4 x float]
   ; CHECK: [[V0:%.*]] = extractvalue %dx.types.ResRet.f32 [[LOAD]], 0
@@ -89,10 +93,10 @@ define void @index_dynamic(i32 %bufindex, i32 %elemindex) {
   ;
   ; CHECK: [[PTR:%.*]] = getelementptr inbounds [4 x float], ptr [[ALLOCA]], i32 0, i32 %elemindex
   ; CHECK: [[X:%.*]] = load float, ptr [[PTR]]
-  %data = extractelement <4 x float> %load, i32 %elemindex
+  %x = extractelement <4 x float> %data, i32 %elemindex
 
   ; CHECK: call void @scalar_user(float [[X]])
-  call void @scalar_user(float %data)
+  call void @scalar_user(float %x)
 
   ret void
 }
@@ -105,8 +109,9 @@ define void @loadf32() {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call float @llvm.dx.resource.load.typedbuffer(
+  %load0 = call {float, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", float, 0, 0, 0) %buffer, i32 0)
+  %data0 = extractvalue {float, i1} %load0, 0
 
   ; CHECK: [[VAL0:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA0]], 0
   ; CHECK: call void @scalar_user(float [[VAL0]])
@@ -123,7 +128,7 @@ define void @loadv2f32() {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call <2 x float> @llvm.dx.resource.load.typedbuffer(
+  %data0 = call {<2 x float>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <2 x float>, 0, 0, 0) %buffer, i32 0)
 
   ret void
@@ -137,7 +142,7 @@ define void @loadv4f32_checkbit() {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call {<4 x float>, i1} @llvm.dx.resource.loadchecked.typedbuffer.f32(
+  %data0 = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer.f32(
       target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 0)
 
   ; CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA0]], 4
@@ -158,7 +163,7 @@ define void @loadv4i32() {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call <4 x i32> @llvm.dx.resource.load.typedbuffer(
+  %data0 = call {<4 x i32>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x i32>, 0, 0, 0) %buffer, i32 0)
 
   ret void
@@ -172,7 +177,7 @@ define void @loadv4f16() {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f16 @dx.op.bufferLoad.f16(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call <4 x half> @llvm.dx.resource.load.typedbuffer(
+  %data0 = call {<4 x half>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x half>, 0, 0, 0) %buffer, i32 0)
 
   ret void
@@ -186,7 +191,7 @@ define void @loadv4i16() {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.i16 @dx.op.bufferLoad.i16(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call <4 x i16> @llvm.dx.resource.load.typedbuffer(
+  %data0 = call {<4 x i16>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x i16>, 0, 0, 0) %buffer, i32 0)
 
   ret void
diff --git a/llvm/test/CodeGen/DirectX/RawBufferLoad-error64.ll b/llvm/test/CodeGen/DirectX/RawBufferLoad-error64.ll
new file mode 100644
index 0000000..b8a6649
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/RawBufferLoad-error64.ll
@@ -0,0 +1,24 @@
+; We use llc for this test so that we don't abort after the first error.
+; RUN: not llc %s -o /dev/null 2>&1 | FileCheck %s
+
+target triple = "dxil-pc-shadermodel6.2-compute"
+
+declare void @v4f64_user(<4 x double>)
+
+; Can't load 64 bit types directly until SM6.3 (byteaddressbuf.Load<int64_t4>)
+; CHECK: error:
+; CHECK-SAME: in function loadv4f64_byte
+; CHECK-SAME: Cannot create RawBufferLoad operation: Invalid overload type
+define void @loadv4f64_byte(i32 %offset) "hlsl.export" {
+  %buffer = call target("dx.RawBuffer", i8, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  %load = call {<4 x double>, i1} @llvm.dx.resource.load.rawbuffer.v4i64(
+      target("dx.RawBuffer", i8, 0, 0, 0) %buffer, i32 %offset, i32 0)
+  %data = extractvalue {<4 x double>, i1} %load, 0
+
+  call void @v4f64_user(<4 x double> %data)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/RawBufferLoad.ll b/llvm/test/CodeGen/DirectX/RawBufferLoad.ll
new file mode 100644
index 0000000..586b9c4
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/RawBufferLoad.ll
@@ -0,0 +1,232 @@
+; RUN: opt -S -dxil-op-lower %s | FileCheck %s
+
+target triple = "dxil-pc-shadermodel6.6-compute"
+
+declare void @f32_user(float)
+declare void @v4f32_user(<4 x float>)
+declare void @i32_user(i32)
+declare void @v4i32_user(<4 x i32>)
+declare void @v3f16_user(<3 x half>)
+declare void @v4f64_user(<4 x double>)
+
+; CHECK-LABEL: define void @loadf32_struct
+define void @loadf32_struct(i32 %index) {
+  %buffer = call target("dx.RawBuffer", float, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 0, i8 1, i32 4)
+  %load = call {float, i1}
+      @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_f32_0_0_0t(
+          target("dx.RawBuffer", float, 0, 0, 0) %buffer,
+          i32 %index,
+          i32 0)
+  %data = extractvalue {float, i1} %load, 0
+
+  ; CHECK: [[VAL:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA]], 0
+  ; CHECK: call void @f32_user(float [[VAL]])
+  call void @f32_user(float %data)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadf32_byte
+define void @loadf32_byte(i32 %offset) {
+  %buffer = call target("dx.RawBuffer", i8, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 %offset, i32 0, i8 1, i32 4)
+  %load = call {float, i1}
+      @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_i8_0_0_0t(
+          target("dx.RawBuffer", i8, 0, 0, 0) %buffer,
+          i32 %offset,
+          i32 0)
+  %data = extractvalue {float, i1} %load, 0
+
+  ; CHECK: [[VAL:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA]], 0
+  ; CHECK: call void @f32_user(float [[VAL]])
+  call void @f32_user(float %data)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadv4f32_struct
+define void @loadv4f32_struct(i32 %index) {
+  %buffer = call target("dx.RawBuffer", <4 x float>, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_v4f32_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 0, i8 15, i32 4)
+  %load = call {<4 x float>, i1}
+      @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_v4f32_0_0_0t(
+          target("dx.RawBuffer", <4 x float>, 0, 0, 0) %buffer,
+          i32 %index,
+          i32 0)
+  %data = extractvalue {<4 x float>, i1} %load, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 2
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 3
+  ; CHECK: insertelement <4 x float> undef
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: call void @v4f32_user(<4 x float>
+  call void @v4f32_user(<4 x float> %data)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadv4f32_byte
+define void @loadv4f32_byte(i32 %offset) {
+  %buffer = call target("dx.RawBuffer", i8, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 %offset, i32 0, i8 15, i32 4)
+  %load = call {<4 x float>, i1}
+      @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_i8_0_0_0t(
+          target("dx.RawBuffer", i8, 0, 0, 0) %buffer,
+          i32 %offset,
+          i32 0)
+  %data = extractvalue {<4 x float>, i1} %load, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 2
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 3
+  ; CHECK: insertelement <4 x float> undef
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: call void @v4f32_user(<4 x float>
+  call void @v4f32_user(<4 x float> %data)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadelements
+define void @loadelements(i32 %index) {
+  %buffer = call target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_sl_v4f32v4i32s_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATAF32:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 0, i8 15, i32 4)
+  %loadf32 = call {<4 x float>, i1}
+      @llvm.dx.resource.load.rawbuffer.v4f32(
+          target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0, 0) %buffer,
+          i32 %index,
+          i32 0)
+  %dataf32 = extractvalue {<4 x float>, i1} %loadf32, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 2
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 3
+  ; CHECK: insertelement <4 x float> undef
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: call void @v4f32_user(<4 x float>
+  call void @v4f32_user(<4 x float> %dataf32)
+
+  ; CHECK: [[DATAI32:%.*]] = call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 1, i8 15, i32 4)
+  %loadi32 = call {<4 x i32>, i1}
+      @llvm.dx.resource.load.rawbuffer.v4i32(
+          target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0, 0) %buffer,
+          i32 %index,
+          i32 1)
+  %datai32 = extractvalue {<4 x i32>, i1} %loadi32, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.i32 [[DATAI32]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.i32 [[DATAI32]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.i32 [[DATAI32]], 2
+  ; CHECK: extractvalue %dx.types.ResRet.i32 [[DATAI32]], 3
+  ; CHECK: insertelement <4 x i32> undef
+  ; CHECK: insertelement <4 x i32>
+  ; CHECK: insertelement <4 x i32>
+  ; CHECK: insertelement <4 x i32>
+  ; CHECK: call void @v4i32_user(<4 x i32>
+  call void @v4i32_user(<4 x i32> %datai32)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadnested
+define void @loadnested(i32 %index) {
+  %buffer = call
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATAI32:%.*]] = call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 0, i8 1, i32 4)
+  %loadi32 = call {i32, i1} @llvm.dx.resource.load.rawbuffer.i32(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 0)
+  %datai32 = extractvalue {i32, i1} %loadi32, 0
+
+  ; CHECK: [[VALI32:%.*]] = extractvalue %dx.types.ResRet.i32 [[DATAI32]], 0
+  ; CHECK: call void @i32_user(i32 [[VALI32]])
+  call void @i32_user(i32 %datai32)
+
+  ; CHECK: [[DATAF32:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 4, i8 15, i32 4)
+  %loadf32 = call {<4 x float>, i1} @llvm.dx.resource.load.rawbuffer.v4f32(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 4)
+  %dataf32 = extractvalue {<4 x float>, i1} %loadf32, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 2
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 3
+  ; CHECK: insertelement <4 x float> undef
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: call void @v4f32_user(<4 x float>
+  call void @v4f32_user(<4 x float> %dataf32)
+
+  ; CHECK: [[DATAF16:%.*]] = call %dx.types.ResRet.f16 @dx.op.rawBufferLoad.f16(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 20, i8 7, i32 2)
+  %loadf16 = call {<3 x half>, i1} @llvm.dx.resource.load.rawbuffer.v3f16(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 20)
+  %dataf16 = extractvalue {<3 x half>, i1} %loadf16, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.f16 [[DATAF16]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.f16 [[DATAF16]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.f16 [[DATAF16]], 2
+  ; CHECK: insertelement <3 x half> undef
+  ; CHECK: insertelement <3 x half>
+  ; CHECK: insertelement <3 x half>
+  ; CHECK: call void @v3f16_user(<3 x half>
+  call void @v3f16_user(<3 x half> %dataf16)
+
+  ret void
+}
+
+; byteaddressbuf.Load<int64_t4>
+; CHECK-LABEL: define void @loadv4f64_byte
+define void @loadv4f64_byte(i32 %offset) {
+  %buffer = call target("dx.RawBuffer", i8, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f64 @dx.op.rawBufferLoad.f64(i32 139, %dx.types.Handle %{{.*}}, i32 %offset, i32 0, i8 15, i32 8)
+  %load = call {<4 x double>, i1} @llvm.dx.resource.load.rawbuffer.v4i64(
+      target("dx.RawBuffer", i8, 0, 0, 0) %buffer, i32 %offset, i32 0)
+  %data = extractvalue {<4 x double>, i1} %load, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.f64 [[DATA]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.f64 [[DATA]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.f64 [[DATA]], 2
+  ; CHECK: extractvalue %dx.types.ResRet.f64 [[DATA]], 3
+  ; CHECK: insertelement <4 x double> undef
+  ; CHECK: insertelement <4 x double>
+  ; CHECK: insertelement <4 x double>
+  ; CHECK: insertelement <4 x double>
+  ; CHECK: call void @v4f64_user(<4 x double>
+  call void @v4f64_user(<4 x double> %data)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load_typedbuffer.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load_typedbuffer.ll
index 9b7e7fd..8769e6e 100644
--- a/llvm/test/CodeGen/DirectX/ResourceAccess/load_typedbuffer.ll
+++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load_typedbuffer.ll
@@ -15,17 +15,19 @@ define void @load_float4(i32 %index, i32 %elemindex) {
   %ptr = call ptr @llvm.dx.resource.getpointer(
       target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
 
-  ; CHECK: %[[VALUE:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VALUE:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
   %vec_data = load <4 x float>, ptr %ptr
   call void @use_float4(<4 x float> %vec_data)
 
-  ; CHECK: %[[VALUE:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[LOAD:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VALUE:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD]], 0
   ; CHECK: extractelement <4 x float> %[[VALUE]], i32 1
   %y_ptr = getelementptr inbounds <4 x float>, ptr %ptr, i32 0, i32 1
   %y_data = load float, ptr %y_ptr
   call void @use_float(float %y_data)
 
-  ; CHECK: %[[VALUE:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[LOAD:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VALUE:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD]], 0
   ; CHECK: extractelement <4 x float> %[[VALUE]], i32 %elemindex
   %dynamic = getelementptr inbounds <4 x float>, ptr %ptr, i32 0, i32 %elemindex
   %dyndata = load float, ptr %dynamic
diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/store_typedbuffer.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/store_typedbuffer.ll
index 1760640..0b7882a 100644
--- a/llvm/test/CodeGen/DirectX/ResourceAccess/store_typedbuffer.ll
+++ b/llvm/test/CodeGen/DirectX/ResourceAccess/store_typedbuffer.ll
@@ -18,21 +18,24 @@ define void @store_float4(<4 x float> %data, i32 %index, i32 %elemindex) {
 
   ; Store just the .x component
   %scalar = extractelement <4 x float> %data, i32 0
-  ; CHECK: %[[LOAD:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[LOAD]], float %scalar, i32 0
+  ; CHECK: %[[LOAD:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[VEC]], float %scalar, i32 0
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index, <4 x float> %[[INSERT]])
   store float %scalar, ptr %ptr
 
   ; Store just the .y component
-  ; CHECK: %[[LOAD:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[LOAD]], float %scalar, i32 1
+  ; CHECK: %[[LOAD:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[VEC]], float %scalar, i32 1
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index, <4 x float> %[[INSERT]])
   %y_ptr = getelementptr inbounds i8, ptr %ptr, i32 4
   store float %scalar, ptr %y_ptr
 
   ; Store to one of the elements dynamically
-  ; CHECK: %[[LOAD:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[LOAD]], float %scalar, i32 %elemindex
+  ; CHECK: %[[LOAD:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[VEC]], float %scalar, i32 %elemindex
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index, <4 x float> %[[INSERT]])
   %dynamic = getelementptr inbounds <4 x float>, ptr %ptr, i32 0, i32 %elemindex
   store float %scalar, ptr %dynamic
@@ -56,14 +59,16 @@ define void @store_half4(<4 x half> %data, i32 %index) {
 
   ; Store just the .x component
   %scalar = extractelement <4 x half> %data, i32 0
-  ; CHECK: %[[LOAD:.*]] = call <4 x half> @llvm.dx.resource.load.typedbuffer.v4f16.tdx.TypedBuffer_v4f16_1_0_0t(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <4 x half> %[[LOAD]], half %scalar, i32 0
+  ; CHECK: %[[LOAD:.*]] = call { <4 x half>, i1 } @llvm.dx.resource.load.typedbuffer.v4f16.tdx.TypedBuffer_v4f16_1_0_0t(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <4 x half>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <4 x half> %[[VEC]], half %scalar, i32 0
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f16_1_0_0t.v4f16(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index, <4 x half> %[[INSERT]])
   store half %scalar, ptr %ptr
 
   ; Store just the .y component
-  ; CHECK: %[[LOAD:.*]] = call <4 x half> @llvm.dx.resource.load.typedbuffer.v4f16.tdx.TypedBuffer_v4f16_1_0_0t(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <4 x half> %[[LOAD]], half %scalar, i32 1
+  ; CHECK: %[[LOAD:.*]] = call { <4 x half>, i1 } @llvm.dx.resource.load.typedbuffer.v4f16.tdx.TypedBuffer_v4f16_1_0_0t(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <4 x half>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <4 x half> %[[VEC]], half %scalar, i32 1
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f16_1_0_0t.v4f16(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index, <4 x half> %[[INSERT]])
   %y_ptr = getelementptr inbounds i8, ptr %ptr, i32 2
   store half %scalar, ptr %y_ptr
@@ -87,14 +92,16 @@ define void @store_double2(<2 x double> %data, i32 %index) {
 
   ; Store just the .x component
   %scalar = extractelement <2 x double> %data, i32 0
-  ; CHECK: %[[LOAD:.*]] = call <2 x double> @llvm.dx.resource.load.typedbuffer.v2f64.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <2 x double> %[[LOAD]], double %scalar, i32 0
+  ; CHECK: %[[LOAD:.*]] = call { <2 x double>, i1 } @llvm.dx.resource.load.typedbuffer.v2f64.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <2 x double>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <2 x double> %[[VEC]], double %scalar, i32 0
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v2f64_1_0_0t.v2f64(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index, <2 x double> %[[INSERT]])
   store double %scalar, ptr %ptr
 
   ; Store just the .y component
-  ; CHECK: %[[LOAD:.*]] = call <2 x double> @llvm.dx.resource.load.typedbuffer.v2f64.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <2 x double> %[[LOAD]], double %scalar, i32 1
+  ; CHECK: %[[LOAD:.*]] = call { <2 x double>, i1 } @llvm.dx.resource.load.typedbuffer.v2f64.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <2 x double>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <2 x double> %[[VEC]], double %scalar, i32 1
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v2f64_1_0_0t.v2f64(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index, <2 x double> %[[INSERT]])
   %y_ptr = getelementptr inbounds i8, ptr %ptr, i32 8
   store double %scalar, ptr %y_ptr
diff --git a/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll b/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll
index c837b36..cd21adc 100644
--- a/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll
+++ b/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll
@@ -29,18 +29,20 @@ entry:
   %0 = call i32 @llvm.dx.flattened.thread.id.in.group()
   ; CHECK-NOT: load {{.*}} ptr @In
   %1 = load target("dx.TypedBuffer", <4 x float>, 1, 0, 0), ptr @In, align 4
-  ; CSE: call noundef <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t
-  %2 = call noundef <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %1, i32 %0)
+  ; CSE: call noundef { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t
+  %load = call noundef {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %1, i32 %0)
+  %2 = extractvalue {<4 x float>, i1} %load, 0
   ; CHECK-NOT: load {{.*}} ptr @In
   %3 = load target("dx.TypedBuffer", <4 x float>, 1, 0, 0), ptr @In, align 4
-  %4 = call noundef <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %3, i32 %0)
+  %load2 = call noundef {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %3, i32 %0)
+  %4 = extractvalue {<4 x float>, i1} %load2, 0
   %add.i = fadd <4 x float> %2, %4
   call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %Out_h.i, i32 %0, <4 x float> %add.i)
   ; CHECK: ret void
   ret void
 }
 
-; CSE-DAG: declare <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32) [[ROAttr:#[0-9]+]]
+; CSE-DAG: declare { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32) [[ROAttr:#[0-9]+]]
 ; CSE-DAG: declare void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32, <4 x float>) [[WOAttr:#[0-9]+]]
 
 attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="8,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/typed-uav-load-additional-formats.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/typed-uav-load-additional-formats.ll
index 2622335..060d54f 100644
--- a/llvm/test/CodeGen/DirectX/ShaderFlags/typed-uav-load-additional-formats.ll
+++ b/llvm/test/CodeGen/DirectX/ShaderFlags/typed-uav-load-additional-formats.ll
@@ -17,8 +17,9 @@ target triple = "dxil-pc-shadermodel6.7-library"
 define <4 x float> @multicomponent() #0 {
   %res = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
       @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
-  %val = call <4 x float> @llvm.dx.resource.load.typedbuffer(
+  %load = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %res, i32 0)
+  %val = extractvalue {<4 x float>, i1} %load, 0
   ret <4 x float> %val
 }
 
@@ -26,8 +27,9 @@ define <4 x float> @multicomponent() #0 {
 define float @onecomponent() #0 {
   %res = call target("dx.TypedBuffer", float, 1, 0, 0)
       @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
-  %val = call float @llvm.dx.resource.load.typedbuffer(
+  %load = call {float, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", float, 1, 0, 0) %res, i32 0)
+  %val = extractvalue {float, i1} %load, 0
   ret float %val
 }
 
diff --git a/llvm/test/CodeGen/Hexagon/loopIdiom.ll b/llvm/test/CodeGen/Hexagon/loopIdiom.ll
new file mode 100644
index 0000000..9c3df67
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/loopIdiom.ll
@@ -0,0 +1,75 @@
+; RUN: opt -debug -S -march=hexagon -O2  < %s | FileCheck %s
+; REQUIRES: asserts
+; CHECK: define dso_local void @complexMultAccum
+target triple = "hexagon"
+
+; Function Attrs: noinline nounwind
+define dso_local void @complexMultAccum(i32 noundef %n) #0 {
+entry:
+  %n.addr = alloca i32, align 4
+  %run_c_code = alloca i8, align 1
+  %run_asm_code = alloca i8, align 1
+  %iOutter = alloca i32, align 4
+  %iOutter1 = alloca i32, align 4
+  store i32 %n, ptr %n.addr, align 4
+  store i8 1, ptr %run_c_code, align 1
+  store i8 0, ptr %run_asm_code, align 1
+  %0 = load i8, ptr %run_c_code, align 1
+  %tobool = icmp ne i8 %0, 0
+  br i1 %tobool, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  store i32 0, ptr %iOutter, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %if.then
+  %1 = load i32, ptr %iOutter, align 4
+  %cmp = icmp slt i32 %1, 2
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %2 = load i32, ptr %iOutter, align 4
+  %inc = add nsw i32 %2, 1
+  store i32 %inc, ptr %iOutter, align 4
+  br label %for.cond, !llvm.loop !3
+
+for.end:                                          ; preds = %for.cond
+  store i32 0, ptr %iOutter1, align 4
+  br label %for.cond2
+
+for.cond2:                                        ; preds = %for.inc5, %for.end
+  %3 = load i32, ptr %iOutter1, align 4
+  %cmp3 = icmp slt i32 %3, 2
+  br i1 %cmp3, label %for.body4, label %for.end7
+
+for.body4:                                        ; preds = %for.cond2
+  br label %for.inc5
+
+for.inc5:                                         ; preds = %for.body4
+  %4 = load i32, ptr %iOutter1, align 4
+  %inc6 = add nsw i32 %4, 1
+  store i32 %inc6, ptr %iOutter1, align 4
+  br label %for.cond2, !llvm.loop !5
+
+for.end7:                                         ; preds = %for.cond2
+  br label %if.end
+
+if.end:                                           ; preds = %for.end7, %entry
+  ret void
+}
+
+attributes #0 = { noinline nounwind "approx-func-fp-math"="true" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv79" "target-features"="+v79,-long-calls" "unsafe-fp-math"="true" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{!"LLVM Clang"}
+!3 = distinct !{!3, !4}
+!4 = !{!"llvm.loop.mustprogress"}
+!5 = distinct !{!5, !4}
+
diff --git a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
index fc0c7ad..e0a93e3 100644
--- a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
+++ b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
@@ -315,9 +315,9 @@ define void @test_la_tls_le(i32 signext %n) {
 ; LA32-LABEL: test_la_tls_le:
 ; LA32:       # %bb.0: # %entry
 ; LA32-NEXT:    move $a1, $zero
-; LA32-NEXT:    lu12i.w $a2, %le_hi20(le)
-; LA32-NEXT:    ori $a2, $a2, %le_lo12(le)
-; LA32-NEXT:    add.w $a2, $a2, $tp
+; LA32-NEXT:    lu12i.w $a2, %le_hi20_r(le)
+; LA32-NEXT:    add.w $a2, $a2, $tp, %le_add_r(le)
+; LA32-NEXT:    addi.w $a2, $a2, %le_lo12_r(le)
 ; LA32-NEXT:    .p2align 4, , 16
 ; LA32-NEXT:  .LBB4_1: # %loop
 ; LA32-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -330,12 +330,13 @@ define void @test_la_tls_le(i32 signext %n) {
 ; LA64-LABEL: test_la_tls_le:
 ; LA64:       # %bb.0: # %entry
 ; LA64-NEXT:    move $a1, $zero
-; LA64-NEXT:    lu12i.w $a2, %le_hi20(le)
-; LA64-NEXT:    ori $a2, $a2, %le_lo12(le)
+; LA64-NEXT:    lu12i.w $a2, %le_hi20_r(le)
+; LA64-NEXT:    add.d $a2, $a2, $tp, %le_add_r(le)
+; LA64-NEXT:    addi.d $a2, $a2, %le_lo12_r(le)
 ; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB4_1: # %loop
 ; LA64-NEXT:    # =>This Inner Loop Header: Depth=1
-; LA64-NEXT:    ldx.w $zero, $a2, $tp
+; LA64-NEXT:    ld.w $zero, $a2, 0
 ; LA64-NEXT:    addi.w $a1, $a1, 1
 ; LA64-NEXT:    blt $a1, $a0, .LBB4_1
 ; LA64-NEXT:  # %bb.2: # %ret
diff --git a/llvm/test/CodeGen/LoongArch/mir-target-flags.ll b/llvm/test/CodeGen/LoongArch/mir-target-flags.ll
index f530e3e..3bc8a8d 100644
--- a/llvm/test/CodeGen/LoongArch/mir-target-flags.ll
+++ b/llvm/test/CodeGen/LoongArch/mir-target-flags.ll
@@ -28,8 +28,9 @@ define void @caller() nounwind {
 ; CHECK-NEXT: target-flags(loongarch-got-pc-lo) @t_ld
 ; CHECK:      target-flags(loongarch-ie-pc-hi) @t_ie
 ; CHECK-NEXT: target-flags(loongarch-ie-pc-lo) @t_ie
-; CHECK:      target-flags(loongarch-le-hi) @t_le
-; CHECK-NEXT: target-flags(loongarch-le-lo) @t_le
+; CHECK:      target-flags(loongarch-le-hi-r) @t_le
+; CHECK-NEXT: target-flags(loongarch-le-add-r) @t_le
+; CHECK-NEXT: target-flags(loongarch-le-lo-r) @t_le
 ; CHECK:      target-flags(loongarch-call-plt) @callee1
 ; CHECK:      target-flags(loongarch-call) @callee2
   %a = load volatile i32, ptr @g_e
diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
index c7de3dc..3390f7f 100644
--- a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
+++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --relocation-model=pic --post-RA-scheduler=0 < %s \
 ; RUN:     | FileCheck %s --check-prefix=MEDIUM_NO_SCH
 ; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --relocation-model=pic --post-RA-scheduler=1 < %s \
@@ -7,6 +6,14 @@
 ; RUN:     | FileCheck %s --check-prefix=LARGE_NO_SCH
 ; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=large --relocation-model=pic --post-RA-scheduler=1 < %s \
 ; RUN:     | FileCheck %s --check-prefix=LARGE_SCH
+; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --relocation-model=pic --enable-tlsdesc \
+; RUN:     --post-RA-scheduler=0 < %s | FileCheck %s --check-prefix=MEDIUMDESC_NO_SCH
+; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --relocation-model=pic --enable-tlsdesc \
+; RUN:     --post-RA-scheduler=1 < %s | FileCheck %s --check-prefix=MEDIUMDESC_SCH
+; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=large --relocation-model=pic --enable-tlsdesc \
+; RUN:     --post-RA-scheduler=0 < %s | FileCheck %s --check-prefix=LARGEDESC_NO_SCH
+; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=large --relocation-model=pic --enable-tlsdesc \
+; RUN:     --post-RA-scheduler=1 < %s | FileCheck %s --check-prefix=LARGEDESC_SCH
 
 @g = dso_local global i64 zeroinitializer, align 4
 @G = global i64 zeroinitializer, align 4
@@ -194,3 +201,69 @@ define void @foo() nounwind {
   %v_ie = load volatile i64, ptr @ie
   ret void
 }
+
+define void @baz() nounwind {
+; MEDIUMDESC_NO_SCH-LABEL: baz:
+; MEDIUMDESC_NO_SCH:       # %bb.0:
+; MEDIUMDESC_NO_SCH-NEXT:    addi.d $sp, $sp, -16
+; MEDIUMDESC_NO_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; MEDIUMDESC_NO_SCH-NEXT:    pcalau12i $a0, %desc_pc_hi20(gd)
+; MEDIUMDESC_NO_SCH-NEXT:    addi.d $a0, $a0, %desc_pc_lo12(gd)
+; MEDIUMDESC_NO_SCH-NEXT:    ld.d $ra, $a0, %desc_ld(gd)
+; MEDIUMDESC_NO_SCH-NEXT:    jirl $ra, $ra, %desc_call(gd)
+; MEDIUMDESC_NO_SCH-NEXT:    add.d $a0, $a0, $tp
+; MEDIUMDESC_NO_SCH-NEXT:    ld.d $zero, $a0, 0
+; MEDIUMDESC_NO_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; MEDIUMDESC_NO_SCH-NEXT:    addi.d $sp, $sp, 16
+; MEDIUMDESC_NO_SCH-NEXT:    ret
+;
+; MEDIUMDESC_SCH-LABEL: baz:
+; MEDIUMDESC_SCH:       # %bb.0:
+; MEDIUMDESC_SCH-NEXT:    addi.d $sp, $sp, -16
+; MEDIUMDESC_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; MEDIUMDESC_SCH-NEXT:    pcalau12i $a0, %desc_pc_hi20(gd)
+; MEDIUMDESC_SCH-NEXT:    addi.d $a0, $a0, %desc_pc_lo12(gd)
+; MEDIUMDESC_SCH-NEXT:    ld.d $ra, $a0, %desc_ld(gd)
+; MEDIUMDESC_SCH-NEXT:    jirl $ra, $ra, %desc_call(gd)
+; MEDIUMDESC_SCH-NEXT:    add.d $a0, $a0, $tp
+; MEDIUMDESC_SCH-NEXT:    ld.d $zero, $a0, 0
+; MEDIUMDESC_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; MEDIUMDESC_SCH-NEXT:    addi.d $sp, $sp, 16
+; MEDIUMDESC_SCH-NEXT:    ret
+;
+; LARGEDESC_NO_SCH-LABEL: baz:
+; LARGEDESC_NO_SCH:       # %bb.0:
+; LARGEDESC_NO_SCH-NEXT:    addi.d $sp, $sp, -16
+; LARGEDESC_NO_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LARGEDESC_NO_SCH-NEXT:    pcalau12i $a0, %desc_pc_hi20(gd)
+; LARGEDESC_NO_SCH-NEXT:    addi.d $a1, $zero, %desc_pc_lo12(gd)
+; LARGEDESC_NO_SCH-NEXT:    lu32i.d $a1, %desc64_pc_lo20(gd)
+; LARGEDESC_NO_SCH-NEXT:    lu52i.d $a1, $a1, %desc64_pc_hi12(gd)
+; LARGEDESC_NO_SCH-NEXT:    add.d $a0, $a0, $a1
+; LARGEDESC_NO_SCH-NEXT:    ld.d $ra, $a0, %desc_ld(gd)
+; LARGEDESC_NO_SCH-NEXT:    jirl $ra, $ra, %desc_call(gd)
+; LARGEDESC_NO_SCH-NEXT:    add.d $a0, $a0, $tp
+; LARGEDESC_NO_SCH-NEXT:    ld.d $zero, $a0, 0
+; LARGEDESC_NO_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LARGEDESC_NO_SCH-NEXT:    addi.d $sp, $sp, 16
+; LARGEDESC_NO_SCH-NEXT:    ret
+;
+; LARGEDESC_SCH-LABEL: baz:
+; LARGEDESC_SCH:       # %bb.0:
+; LARGEDESC_SCH-NEXT:    addi.d $sp, $sp, -16
+; LARGEDESC_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LARGEDESC_SCH-NEXT:    pcalau12i $a0, %desc_pc_hi20(gd)
+; LARGEDESC_SCH-NEXT:    addi.d $a1, $zero, %desc_pc_lo12(gd)
+; LARGEDESC_SCH-NEXT:    lu32i.d $a1, %desc64_pc_lo20(gd)
+; LARGEDESC_SCH-NEXT:    lu52i.d $a1, $a1, %desc64_pc_hi12(gd)
+; LARGEDESC_SCH-NEXT:    add.d $a0, $a0, $a1
+; LARGEDESC_SCH-NEXT:    ld.d $ra, $a0, %desc_ld(gd)
+; LARGEDESC_SCH-NEXT:    jirl $ra, $ra, %desc_call(gd)
+; LARGEDESC_SCH-NEXT:    add.d $a0, $a0, $tp
+; LARGEDESC_SCH-NEXT:    ld.d $zero, $a0, 0
+; LARGEDESC_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LARGEDESC_SCH-NEXT:    addi.d $sp, $sp, 16
+; LARGEDESC_SCH-NEXT:    ret
+  %v_gd = load volatile i64, ptr @gd
+  ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/tls-models.ll b/llvm/test/CodeGen/LoongArch/tls-models.ll
index dbd7bf6..e3a8ace 100644
--- a/llvm/test/CodeGen/LoongArch/tls-models.ll
+++ b/llvm/test/CodeGen/LoongArch/tls-models.ll
@@ -330,16 +330,16 @@ entry:
 define ptr @f4() nounwind {
 ; LA32PIC-LABEL: f4:
 ; LA32PIC:       # %bb.0: # %entry
-; LA32PIC-NEXT:    lu12i.w $a0, %le_hi20(le)
-; LA32PIC-NEXT:    ori $a0, $a0, %le_lo12(le)
-; LA32PIC-NEXT:    add.w $a0, $a0, $tp
+; LA32PIC-NEXT:    lu12i.w $a0, %le_hi20_r(le)
+; LA32PIC-NEXT:    add.w $a0, $a0, $tp, %le_add_r(le)
+; LA32PIC-NEXT:    addi.w $a0, $a0, %le_lo12_r(le)
 ; LA32PIC-NEXT:    ret
 ;
 ; LA64PIC-LABEL: f4:
 ; LA64PIC:       # %bb.0: # %entry
-; LA64PIC-NEXT:    lu12i.w $a0, %le_hi20(le)
-; LA64PIC-NEXT:    ori $a0, $a0, %le_lo12(le)
-; LA64PIC-NEXT:    add.d $a0, $a0, $tp
+; LA64PIC-NEXT:    lu12i.w $a0, %le_hi20_r(le)
+; LA64PIC-NEXT:    add.d $a0, $a0, $tp, %le_add_r(le)
+; LA64PIC-NEXT:    addi.d $a0, $a0, %le_lo12_r(le)
 ; LA64PIC-NEXT:    ret
 ;
 ; LA64LARGEPIC-LABEL: f4:
@@ -353,16 +353,16 @@ define ptr @f4() nounwind {
 ;
 ; LA32NOPIC-LABEL: f4:
 ; LA32NOPIC:       # %bb.0: # %entry
-; LA32NOPIC-NEXT:    lu12i.w $a0, %le_hi20(le)
-; LA32NOPIC-NEXT:    ori $a0, $a0, %le_lo12(le)
-; LA32NOPIC-NEXT:    add.w $a0, $a0, $tp
+; LA32NOPIC-NEXT:    lu12i.w $a0, %le_hi20_r(le)
+; LA32NOPIC-NEXT:    add.w $a0, $a0, $tp, %le_add_r(le)
+; LA32NOPIC-NEXT:    addi.w $a0, $a0, %le_lo12_r(le)
 ; LA32NOPIC-NEXT:    ret
 ;
 ; LA64NOPIC-LABEL: f4:
 ; LA64NOPIC:       # %bb.0: # %entry
-; LA64NOPIC-NEXT:    lu12i.w $a0, %le_hi20(le)
-; LA64NOPIC-NEXT:    ori $a0, $a0, %le_lo12(le)
-; LA64NOPIC-NEXT:    add.d $a0, $a0, $tp
+; LA64NOPIC-NEXT:    lu12i.w $a0, %le_hi20_r(le)
+; LA64NOPIC-NEXT:    add.d $a0, $a0, $tp, %le_add_r(le)
+; LA64NOPIC-NEXT:    addi.d $a0, $a0, %le_lo12_r(le)
 ; LA64NOPIC-NEXT:    ret
 ;
 ; LA64LARGENOPIC-LABEL: f4:
@@ -376,16 +376,16 @@ define ptr @f4() nounwind {
 ;
 ; LA32DESC-LABEL: f4:
 ; LA32DESC:       # %bb.0: # %entry
-; LA32DESC-NEXT:    lu12i.w $a0, %le_hi20(le)
-; LA32DESC-NEXT:    ori $a0, $a0, %le_lo12(le)
-; LA32DESC-NEXT:    add.w $a0, $a0, $tp
+; LA32DESC-NEXT:    lu12i.w $a0, %le_hi20_r(le)
+; LA32DESC-NEXT:    add.w $a0, $a0, $tp, %le_add_r(le)
+; LA32DESC-NEXT:    addi.w $a0, $a0, %le_lo12_r(le)
 ; LA32DESC-NEXT:    ret
 ;
 ; LA64DESC-LABEL: f4:
 ; LA64DESC:       # %bb.0: # %entry
-; LA64DESC-NEXT:    lu12i.w $a0, %le_hi20(le)
-; LA64DESC-NEXT:    ori $a0, $a0, %le_lo12(le)
-; LA64DESC-NEXT:    add.d $a0, $a0, $tp
+; LA64DESC-NEXT:    lu12i.w $a0, %le_hi20_r(le)
+; LA64DESC-NEXT:    add.d $a0, $a0, $tp, %le_add_r(le)
+; LA64DESC-NEXT:    addi.d $a0, $a0, %le_lo12_r(le)
 ; LA64DESC-NEXT:    ret
 ;
 ; DESC64-LABEL: f4:
diff --git a/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir b/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir
index d2b063a..b2abff7 100644
--- a/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir
+++ b/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir
@@ -25,8 +25,8 @@ alignment:       4
 tracksRegLiveness: true
 frameInfo:
   maxCallFrameSize: 0
-#CHECK:    frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
 #CHECK:    frame-setup CFI_INSTRUCTION negate_ra_sign_state
+#CHECK:    frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
 #CHECK:    frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
 body:             |
   bb.0.entry:
@@ -42,8 +42,8 @@ tracksRegLiveness: true
 frameInfo:
   maxCallFrameSize: 0
 #CHECK:    frame-setup EMITBKEY
-#CHECK:    frame-setup PACIBSP implicit-def $lr, implicit $lr, implicit $sp
 #CHECK:    frame-setup CFI_INSTRUCTION negate_ra_sign_state
+#CHECK:    frame-setup PACIBSP implicit-def $lr, implicit $lr, implicit $sp
 #CHECK:    frame-destroy AUTIBSP implicit-def $lr, implicit $lr, implicit $sp
 body:             |
   bb.0.entry:
@@ -59,8 +59,8 @@ tracksRegLiveness: true
 frameInfo:
   maxCallFrameSize: 0
 #CHECK:    frame-setup PACM
-#CHECK:    frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp, pre-instr-symbol <mcsymbol >
 #CHECK:    frame-setup CFI_INSTRUCTION negate_ra_sign_state_with_pc
+#CHECK:    frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp, pre-instr-symbol <mcsymbol >
 #CHECK:    frame-destroy PACM
 #CHECK:    frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
 body:             |
diff --git a/llvm/test/CodeGen/NVPTX/b52037.ll b/llvm/test/CodeGen/NVPTX/b52037.ll
index 5d1c390..b6317df 100644
--- a/llvm/test/CodeGen/NVPTX/b52037.ll
+++ b/llvm/test/CodeGen/NVPTX/b52037.ll
@@ -39,7 +39,7 @@ declare %int3 @hoge(i32, i32, i32) local_unnamed_addr
 
 declare i64 @foo() local_unnamed_addr
 
-define void @barney(ptr nocapture readonly %arg) local_unnamed_addr {
+define ptx_kernel void @barney(ptr nocapture readonly %arg) local_unnamed_addr {
 bb:
   tail call void asm sideeffect "// KEEP", ""() #1
   %tmp = alloca %struct.zot, align 16
@@ -210,9 +210,6 @@ bb14:                                             ; preds = %bb49.i.lr.ph, %bb49
 attributes #0 = { argmemonly mustprogress nofree nounwind willreturn }
 attributes #1 = { nounwind }
 
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @barney, !"kernel", i32 1}
 !1 = !{!2, !11, i64 64}
 !2 = !{!"_ZTSN7cuneibs22neiblist_iterator_coreE", !3, i64 0, !3, i64 8, !6, i64 16, !8, i64 32, !9, i64 44, !10, i64 48, !11, i64 64, !9, i64 72, !4, i64 76, !9, i64 80}
 !3 = !{!"any pointer", !4, i64 0}
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
index 03cdeb9..8be3a66 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
@@ -182,8 +182,8 @@ define <2 x bfloat> @test_fneg(<2 x bfloat> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_fneg_param_0];
-; CHECK-NEXT:    xor.b32 %r2, %r1, -2147450880;
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fneg_param_0];
+; CHECK-NEXT:    neg.bf16x2 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %r = fneg <2 x bfloat> %a
@@ -532,8 +532,8 @@ define <2 x bfloat> @test_fabs(<2 x bfloat> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_fabs_param_0];
-; CHECK-NEXT:    and.b32 %r2, %r1, 2147450879;
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fabs_param_0];
+; CHECK-NEXT:    abs.bf16x2 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %a)
diff --git a/llvm/test/CodeGen/NVPTX/bug21465.ll b/llvm/test/CodeGen/NVPTX/bug21465.ll
index 9b1f104..76300e3 100644
--- a/llvm/test/CodeGen/NVPTX/bug21465.ll
+++ b/llvm/test/CodeGen/NVPTX/bug21465.ll
@@ -8,7 +8,7 @@ target triple = "nvptx64-unknown-unknown"
 %struct.S = type { i32, i32 }
 
 ; Function Attrs: nounwind
-define void @_Z11TakesStruct1SPi(ptr byval(%struct.S) nocapture readonly %input, ptr nocapture %output) #0 {
+define ptx_kernel void @_Z11TakesStruct1SPi(ptr byval(%struct.S) nocapture readonly %input, ptr nocapture %output) #0 {
 entry:
 ; CHECK-LABEL: @_Z11TakesStruct1SPi
 ; PTX-LABEL: .visible .entry _Z11TakesStruct1SPi(
@@ -23,7 +23,3 @@ entry:
 }
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @_Z11TakesStruct1SPi, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/bug22322.ll b/llvm/test/CodeGen/NVPTX/bug22322.ll
index e3656fd..ace3166 100644
--- a/llvm/test/CodeGen/NVPTX/bug22322.ll
+++ b/llvm/test/CodeGen/NVPTX/bug22322.ll
@@ -8,7 +8,7 @@ target triple = "nvptx64-nvidia-cuda"
 
 ; Function Attrs: nounwind
 ; CHECK-LABEL: some_kernel
-define void @some_kernel(ptr nocapture %dst) #0 {
+define ptx_kernel void @some_kernel(ptr nocapture %dst) #0 {
 _ZL11compute_vecRK6float3jb.exit:
   %ret_vec.sroa.8.i = alloca float, align 4
   %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
@@ -55,8 +55,5 @@ attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "n
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind }
 
-!nvvm.annotations = !{!0}
 !llvm.ident = !{!1}
-
-!0 = !{ptr @some_kernel, !"kernel", i32 1}
 !1 = !{!"clang version 3.5.1 (tags/RELEASE_351/final)"}
diff --git a/llvm/test/CodeGen/NVPTX/bug26185.ll b/llvm/test/CodeGen/NVPTX/bug26185.ll
index 00c97fb..193df7f 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185.ll
@@ -8,7 +8,7 @@ target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-unknown-unknown"
 
 ; CHECK-LABEL: ex_zext
-define void @ex_zext(ptr noalias readonly %data, ptr %res) {
+define ptx_kernel void @ex_zext(ptr noalias readonly %data, ptr %res) {
 entry:
 ; CHECK: ld.global.nc.u8
   %val = load i8, ptr %data
@@ -19,7 +19,7 @@ entry:
 }
 
 ; CHECK-LABEL: ex_sext
-define void @ex_sext(ptr noalias readonly %data, ptr %res) {
+define ptx_kernel void @ex_sext(ptr noalias readonly %data, ptr %res) {
 entry:
 ; CHECK: ld.global.nc.u8
   %val = load i8, ptr %data
@@ -30,7 +30,7 @@ entry:
 }
 
 ; CHECK-LABEL: ex_zext_v2
-define void @ex_zext_v2(ptr noalias readonly %data, ptr %res) {
+define ptx_kernel void @ex_zext_v2(ptr noalias readonly %data, ptr %res) {
 entry:
 ; CHECK: ld.global.nc.v2.u8
   %val = load <2 x i8>, ptr %data
@@ -41,7 +41,7 @@ entry:
 }
 
 ; CHECK-LABEL: ex_sext_v2
-define void @ex_sext_v2(ptr noalias readonly %data, ptr %res) {
+define ptx_kernel void @ex_sext_v2(ptr noalias readonly %data, ptr %res) {
 entry:
 ; CHECK: ld.global.nc.v2.u8
   %val = load <2 x i8>, ptr %data
@@ -51,8 +51,3 @@ entry:
   ret void
 }
 
-!nvvm.annotations = !{!0,!1,!2,!3}
-!0 = !{ptr @ex_zext, !"kernel", i32 1}
-!1 = !{ptr @ex_sext, !"kernel", i32 1}
-!2 = !{ptr @ex_zext_v2, !"kernel", i32 1}
-!3 = !{ptr @ex_sext_v2, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
index 19f4ef8..1c9d271 100644
--- a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -16,7 +16,7 @@
 ;  }
 
 ; CHECK: .visible .entry kernel_func
-define void @kernel_func(ptr %a) {
+define ptx_kernel void @kernel_func(ptr %a) {
 entry:
   %buf = alloca [16 x i8], align 4
 
@@ -56,7 +56,3 @@ entry:
 }
 
 declare void @callee(ptr, ptr)
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @kernel_func, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/cluster-dim.ll b/llvm/test/CodeGen/NVPTX/cluster-dim.ll
index c9258ad..9275c89 100644
--- a/llvm/test/CodeGen/NVPTX/cluster-dim.ll
+++ b/llvm/test/CodeGen/NVPTX/cluster-dim.ll
@@ -3,7 +3,7 @@
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 | FileCheck -check-prefixes=CHECK90 %s
 ; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 | %ptxas-verify -arch=sm_90 %}
 
-define void @kernel_func_clusterxyz() {
+define ptx_kernel void @kernel_func_clusterxyz() {
 ; CHECK80-LABEL: kernel_func_clusterxyz(
 ; CHECK80:       {
 ; CHECK80-EMPTY:
@@ -23,7 +23,6 @@ define void @kernel_func_clusterxyz() {
 }
 
 
-!nvvm.annotations = !{!1, !2}
+!nvvm.annotations = !{!1}
 
-!1 = !{ptr @kernel_func_clusterxyz, !"kernel", i32 1}
-!2 = !{ptr @kernel_func_clusterxyz, !"cluster_dim_x", i32 3, !"cluster_dim_y", i32 5, !"cluster_dim_z", i32 7}
+!1 = !{ptr @kernel_func_clusterxyz, !"cluster_dim_x", i32 3, !"cluster_dim_y", i32 5, !"cluster_dim_z", i32 7}
diff --git a/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll b/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll
new file mode 100644
index 0000000..1b1bb91
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
+target triple = "nvptx64-nvidia-cuda"
+
+@a = external global ptr align 16
+
+define i32  @test_disjoint_or_addr(i16 %a) {
+; CHECK-LABEL: test_disjoint_or_addr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.u64 %rd1, a;
+; CHECK-NEXT:    cvta.global.u64 %rd2, %rd1;
+; CHECK-NEXT:    ld.u32 %r1, [%rd2+8];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+  %a1 = ptrtoint ptr @a to i64
+  %a2 = or disjoint i64 %a1, 8
+  %a3 = inttoptr i64 %a2 to ptr
+  %v = load i32, ptr %a3
+  ret i32 %v
+}
diff --git a/llvm/test/CodeGen/NVPTX/fabs-fneg-free.ll b/llvm/test/CodeGen/NVPTX/fabs-fneg-free.ll
new file mode 100644
index 0000000..9031f33
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fabs-fneg-free.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
+target triple = "nvptx64-nvidia-cuda"
+
+define float @fabs_free(i32 %in) {
+; CHECK-LABEL: fabs_free(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [fabs_free_param_0];
+; CHECK-NEXT:    abs.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    ret;
+  %b = bitcast i32 %in to float
+  %f = call float @llvm.fabs.f32(float %b)
+  ret float %f
+}
+
+define float @fneg_free(i32 %in) {
+; CHECK-LABEL: fneg_free(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [fneg_free_param_0];
+; CHECK-NEXT:    neg.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    ret;
+  %b = bitcast i32 %in to float
+  %f = fneg float %b
+  ret float %f
+}
diff --git a/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll b/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
index 43e4dfc..2b66311 100644
--- a/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
+++ b/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
@@ -12,7 +12,7 @@ target triple = "nvptx-nvidia-cuda"
 @myconst = internal constant i32 420, align 4
 
 
-define void @foo(ptr %a, ptr %b) {
+define ptx_kernel void @foo(ptr %a, ptr %b) {
 ; Expect one load -- @myconst isn't loaded from, because we know its value
 ; statically.
 ; CHECK: ld.global.u32
@@ -24,7 +24,3 @@ define void @foo(ptr %a, ptr %b) {
   store i32 %ld2, ptr %b
   ret void
 }
-
-
-!nvvm.annotations = !{!0}
-!0 = !{ptr @foo, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/i1-array-global.ll b/llvm/test/CodeGen/NVPTX/i1-array-global.ll
index ff3848b..20b376f 100644
--- a/llvm/test/CodeGen/NVPTX/i1-array-global.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-array-global.ll
@@ -7,13 +7,9 @@ target triple = "nvptx-nvidia-cuda"
 @global_cst = private constant [6 x i1] [i1 true, i1 false, i1 true, i1 false, i1 true, i1 false]
 
 ; CHECK: .global .align 1 .b8 global_cst[6] = {1, 0, 1, 0, 1}
-define void @kernel(i32 %i, ptr %out) {
+define ptx_kernel void @kernel(i32 %i, ptr %out) {
   %5 = getelementptr inbounds i1, ptr @global_cst, i32 %i
   %6 = load i1, ptr %5, align 1
   store i1 %6, ptr %out, align 1
   ret void
 }
-
-!nvvm.annotations = !{!0}
-!0 = !{ptr @kernel, !"kernel", i32 1}
-
diff --git a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
index 83f8f80..f5f1dd9 100644
--- a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
@@ -5,7 +5,7 @@
 
 target triple = "nvptx-nvidia-cuda"
 
-define void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
+define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
 ; CHECK-LABEL: foo(
 ; CHECK:    .reg .b16 %rs<2>;
 ; CHECK:    .reg .b32 %r<4>;
@@ -28,7 +28,3 @@ define void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
   store i32 %and, ptr %retval
   ret void
 }
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @foo, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/i1-global.ll b/llvm/test/CodeGen/NVPTX/i1-global.ll
index 17af1fa..60d2ccd 100644
--- a/llvm/test/CodeGen/NVPTX/i1-global.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-global.ll
@@ -8,13 +8,9 @@ target triple = "nvptx-nvidia-cuda"
 @mypred = addrspace(1) global i1 true, align 1
 
 
-define void @foo(i1 %p, ptr %out) {
+define ptx_kernel void @foo(i1 %p, ptr %out) {
   %ld = load i1, ptr addrspace(1) @mypred
   %val = zext i1 %ld to i32
   store i32 %val, ptr %out
   ret void
 }
-
-
-!nvvm.annotations = !{!0}
-!0 = !{ptr @foo, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/i1-param.ll b/llvm/test/CodeGen/NVPTX/i1-param.ll
index 3c74ee6..14d417b 100644
--- a/llvm/test/CodeGen/NVPTX/i1-param.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-param.ll
@@ -9,12 +9,8 @@ target triple = "nvptx-nvidia-cuda"
 ; CHECK: .entry foo
 ; CHECK:   .param .u8 foo_param_0
 ; CHECK:   .param .u64 .ptr .align 1 foo_param_1
-define void @foo(i1 %p, ptr %out) {
+define ptx_kernel void @foo(i1 %p, ptr %out) {
   %val = zext i1 %p to i32
   store i32 %val, ptr %out
   ret void
 }
-
-
-!nvvm.annotations = !{!0}
-!0 = !{ptr @foo, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/intr-range.ll b/llvm/test/CodeGen/NVPTX/intr-range.ll
index 2f3e08a..86776ab 100644
--- a/llvm/test/CodeGen/NVPTX/intr-range.ll
+++ b/llvm/test/CodeGen/NVPTX/intr-range.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 5
 ; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -mcpu=sm_20 -passes=nvvm-intr-range | FileCheck %s
 
-define i32 @test_maxntid() {
-; CHECK-LABEL: define i32 @test_maxntid(
+define ptx_kernel i32 @test_maxntid() {
+; CHECK-LABEL: define ptx_kernel i32 @test_maxntid(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 96) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ; CHECK-NEXT:    [[TMP3:%.*]] = call range(i32 0, 96) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
@@ -31,8 +31,8 @@ define i32 @test_maxntid() {
   ret i32 %11
 }
 
-define i32 @test_reqntid() {
-; CHECK-LABEL: define i32 @test_reqntid(
+define ptx_kernel i32 @test_reqntid() {
+; CHECK-LABEL: define ptx_kernel i32 @test_reqntid(
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ; CHECK-NEXT:    [[TMP5:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
@@ -64,8 +64,8 @@ define i32 @test_reqntid() {
 ;; A case like this could occur if a function with the sreg intrinsic was
 ;; inlined into a kernel where the tid metadata is present, ensure the range is
 ;; updated.
-define i32 @test_inlined() {
-; CHECK-LABEL: define i32 @test_inlined(
+define ptx_kernel i32 @test_inlined() {
+; CHECK-LABEL: define ptx_kernel i32 @test_inlined(
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 4) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ; CHECK-NEXT:    ret i32 [[TMP1]]
@@ -83,6 +83,6 @@ declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
 declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
 
 !nvvm.annotations = !{!0, !1, !2}
-!0 = !{ptr @test_maxntid, !"kernel", i32 1, !"maxntidx", i32 32, !"maxntidz", i32 3}
-!1 = !{ptr @test_reqntid, !"kernel", i32 1, !"reqntidx", i32 20}
-!2 = !{ptr @test_inlined, !"kernel", i32 1, !"maxntidx", i32 4}
+!0 = !{ptr @test_maxntid, !"maxntidx", i32 32, !"maxntidz", i32 3}
+!1 = !{ptr @test_reqntid, !"reqntidx", i32 20}
+!2 = !{ptr @test_inlined, !"maxntidx", i32 4}
diff --git a/llvm/test/CodeGen/NVPTX/kernel-param-align.ll b/llvm/test/CodeGen/NVPTX/kernel-param-align.ll
index 93d428d..2889d2d 100644
--- a/llvm/test/CodeGen/NVPTX/kernel-param-align.ll
+++ b/llvm/test/CodeGen/NVPTX/kernel-param-align.ll
@@ -10,7 +10,7 @@
 ; CHECK: .param .u64 .ptr .shared .align 8  func_align_param_3
 ; CHECK: .param .u64 .ptr .const  .align 16 func_align_param_4
 ; CHECK: .param .u64 .ptr .local  .align 32 func_align_param_5
-define void @func_align(ptr nocapture readonly align 1 %input,
+define ptx_kernel void @func_align(ptr nocapture readonly align 1 %input,
                         ptr nocapture align 2 %out,
                         ptr addrspace(1) align 4 %global,
                         ptr addrspace(3) align 8 %shared,
@@ -27,7 +27,7 @@ entry:
 ; CHECK: .param .u64 .ptr .shared .align 1 func_noalign_param_3
 ; CHECK: .param .u64 .ptr .const  .align 1 func_noalign_param_4
 ; CHECK: .param .u64 .ptr .local  .align 1 func_noalign_param_5
-define void @func_noalign(ptr nocapture readonly %input,
+define ptx_kernel void @func_noalign(ptr nocapture readonly %input,
                           ptr nocapture %out,
                           ptr addrspace(1) %global,
                           ptr addrspace(3) %shared,
@@ -36,7 +36,3 @@ define void @func_noalign(ptr nocapture readonly %input,
 entry:
   ret void
 }
-
-!nvvm.annotations = !{!0, !1}
-!0 = !{ptr @func_align, !"kernel", i32 1}
-!1 = !{ptr @func_noalign, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
index bdaeccd5..dc1917f 100644
--- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
+++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
@@ -10,7 +10,7 @@ target triple = "nvptx64-unknown-unknown"
 ; SM20: ld.global.f32
 ; SM35-LABEL: .visible .entry foo1(
 ; SM35: ld.global.nc.f32
-define void @foo1(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo1(ptr noalias readonly %from, ptr %to) {
   %1 = load float, ptr %from
   store float %1, ptr %to
   ret void
@@ -20,7 +20,7 @@ define void @foo1(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.f64
 ; SM35-LABEL: .visible .entry foo2(
 ; SM35: ld.global.nc.f64
-define void @foo2(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo2(ptr noalias readonly %from, ptr %to) {
   %1 = load double, ptr %from
   store double %1, ptr %to
   ret void
@@ -30,7 +30,7 @@ define void @foo2(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.u16
 ; SM35-LABEL: .visible .entry foo3(
 ; SM35: ld.global.nc.u16
-define void @foo3(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo3(ptr noalias readonly %from, ptr %to) {
   %1 = load i16, ptr %from
   store i16 %1, ptr %to
   ret void
@@ -40,7 +40,7 @@ define void @foo3(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.u32
 ; SM35-LABEL: .visible .entry foo4(
 ; SM35: ld.global.nc.u32
-define void @foo4(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo4(ptr noalias readonly %from, ptr %to) {
   %1 = load i32, ptr %from
   store i32 %1, ptr %to
   ret void
@@ -50,7 +50,7 @@ define void @foo4(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.u64
 ; SM35-LABEL: .visible .entry foo5(
 ; SM35: ld.global.nc.u64
-define void @foo5(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo5(ptr noalias readonly %from, ptr %to) {
   %1 = load i64, ptr %from
   store i64 %1, ptr %to
   ret void
@@ -63,7 +63,7 @@ define void @foo5(ptr noalias readonly %from, ptr %to) {
 ; SM35-LABEL: .visible .entry foo6(
 ; SM35: ld.global.nc.u64
 ; SM35: ld.global.nc.u64
-define void @foo6(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo6(ptr noalias readonly %from, ptr %to) {
   %1 = load i128, ptr %from
   store i128 %1, ptr %to
   ret void
@@ -73,7 +73,7 @@ define void @foo6(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v2.u8
 ; SM35-LABEL: .visible .entry foo7(
 ; SM35: ld.global.nc.v2.u8
-define void @foo7(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo7(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x i8>, ptr %from
   store <2 x i8> %1, ptr %to
   ret void
@@ -83,7 +83,7 @@ define void @foo7(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.u32
 ; SM35-LABEL: .visible .entry foo8(
 ; SM35: ld.global.nc.u32
-define void @foo8(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo8(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x i16>, ptr %from
   store <2 x i16> %1, ptr %to
   ret void
@@ -93,7 +93,7 @@ define void @foo8(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v2.u32
 ; SM35-LABEL: .visible .entry foo9(
 ; SM35: ld.global.nc.v2.u32
-define void @foo9(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo9(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x i32>, ptr %from
   store <2 x i32> %1, ptr %to
   ret void
@@ -103,7 +103,7 @@ define void @foo9(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v2.u64
 ; SM35-LABEL: .visible .entry foo10(
 ; SM35: ld.global.nc.v2.u64
-define void @foo10(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo10(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x i64>, ptr %from
   store <2 x i64> %1, ptr %to
   ret void
@@ -113,7 +113,7 @@ define void @foo10(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v2.f32
 ; SM35-LABEL: .visible .entry foo11(
 ; SM35: ld.global.nc.v2.f32
-define void @foo11(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo11(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x float>, ptr %from
   store <2 x float> %1, ptr %to
   ret void
@@ -123,7 +123,7 @@ define void @foo11(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v2.f64
 ; SM35-LABEL: .visible .entry foo12(
 ; SM35: ld.global.nc.v2.f64
-define void @foo12(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo12(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x double>, ptr %from
   store <2 x double> %1, ptr %to
   ret void
@@ -133,7 +133,7 @@ define void @foo12(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.u32
 ; SM35-LABEL: .visible .entry foo13(
 ; SM35: ld.global.nc.u32
-define void @foo13(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo13(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x i8>, ptr %from
   store <4 x i8> %1, ptr %to
   ret void
@@ -143,7 +143,7 @@ define void @foo13(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v4.u16
 ; SM35-LABEL: .visible .entry foo14(
 ; SM35: ld.global.nc.v4.u16
-define void @foo14(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo14(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x i16>, ptr %from
   store <4 x i16> %1, ptr %to
   ret void
@@ -153,7 +153,7 @@ define void @foo14(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v4.u32
 ; SM35-LABEL: .visible .entry foo15(
 ; SM35: ld.global.nc.v4.u32
-define void @foo15(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo15(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x i32>, ptr %from
   store <4 x i32> %1, ptr %to
   ret void
@@ -163,7 +163,7 @@ define void @foo15(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v4.f32
 ; SM35-LABEL: .visible .entry foo16(
 ; SM35: ld.global.nc.v4.f32
-define void @foo16(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo16(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x float>, ptr %from
   store <4 x float> %1, ptr %to
   ret void
@@ -175,7 +175,7 @@ define void @foo16(ptr noalias readonly %from, ptr %to) {
 ; SM35-LABEL: .visible .entry foo17(
 ; SM35: ld.global.nc.v2.f64
 ; SM35: ld.global.nc.v2.f64
-define void @foo17(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo17(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x double>, ptr %from
   store <4 x double> %1, ptr %to
   ret void
@@ -185,7 +185,7 @@ define void @foo17(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.u64
 ; SM35-LABEL: .visible .entry foo18(
 ; SM35: ld.global.nc.u64
-define void @foo18(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo18(ptr noalias readonly %from, ptr %to) {
   %1 = load ptr, ptr %from
   store ptr %1, ptr %to
   ret void
@@ -196,7 +196,7 @@ define void @foo18(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.f32
 ; SM35-LABEL: .visible .entry foo19(
 ; SM35: ld.global.nc.f32
-define void @foo19(ptr noalias readonly %from, ptr %to, i32 %n) {
+define ptx_kernel void @foo19(ptr noalias readonly %from, ptr %to, i32 %n) {
 entry:
   br label %loop
 
@@ -243,24 +243,3 @@ define void @notkernel2(ptr addrspace(1) noalias readonly %from, ptr %to) {
   store float %1, ptr %to
   ret void
 }
-
-!nvvm.annotations = !{!1 ,!2 ,!3 ,!4 ,!5 ,!6, !7 ,!8 ,!9 ,!10 ,!11 ,!12, !13, !14, !15, !16, !17, !18, !19}
-!1 = !{ptr @foo1, !"kernel", i32 1}
-!2 = !{ptr @foo2, !"kernel", i32 1}
-!3 = !{ptr @foo3, !"kernel", i32 1}
-!4 = !{ptr @foo4, !"kernel", i32 1}
-!5 = !{ptr @foo5, !"kernel", i32 1}
-!6 = !{ptr @foo6, !"kernel", i32 1}
-!7 = !{ptr @foo7, !"kernel", i32 1}
-!8 = !{ptr @foo8, !"kernel", i32 1}
-!9 = !{ptr @foo9, !"kernel", i32 1}
-!10 = !{ptr @foo10, !"kernel", i32 1}
-!11 = !{ptr @foo11, !"kernel", i32 1}
-!12 = !{ptr @foo12, !"kernel", i32 1}
-!13 = !{ptr @foo13, !"kernel", i32 1}
-!14 = !{ptr @foo14, !"kernel", i32 1}
-!15 = !{ptr @foo15, !"kernel", i32 1}
-!16 = !{ptr @foo16, !"kernel", i32 1}
-!17 = !{ptr @foo17, !"kernel", i32 1}
-!18 = !{ptr @foo18, !"kernel", i32 1}
-!19 = !{ptr @foo19, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
index e42f230..f21ff97 100644
--- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
+++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
@@ -29,7 +29,7 @@ define void @foo(i32 %a) {
 ; PTX64:        ld.param.u32     %r{{[0-9]+}}, [foo2_param_0];
 ; PTX64:        add.u64          %rd[[SP_REG:[0-9]+]], %SPL, 0;
 ; PTX64:        st.local.u32  [%rd[[SP_REG]]], %r{{[0-9]+}};
-define void @foo2(i32 %a) {
+define ptx_kernel void @foo2(i32 %a) {
   %local = alloca i32, align 4
   store i32 %a, ptr %local
   call void @bar(ptr %local)
@@ -38,8 +38,6 @@ define void @foo2(i32 %a) {
 
 declare void @bar(ptr %a)
 
-!nvvm.annotations = !{!0}
-!0 = !{ptr @foo2, !"kernel", i32 1}
 
 ; PTX32:        mov.u32          %SPL, __local_depot{{[0-9]+}};
 ; PTX32-NOT:    cvta.local.u32   %SP, %SPL;
diff --git a/llvm/test/CodeGen/NVPTX/lower-alloca.ll b/llvm/test/CodeGen/NVPTX/lower-alloca.ll
index 8f2d551..530b48b 100644
--- a/llvm/test/CodeGen/NVPTX/lower-alloca.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-alloca.ll
@@ -6,7 +6,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx64-unknown-unknown"
 
-define void @kernel() {
+define ptx_kernel void @kernel() {
 ; LABEL: @lower_alloca
 ; PTX-LABEL: .visible .entry kernel(
   %A = alloca i32
@@ -37,7 +37,5 @@ define void @alloca_in_explicit_local_as() {
 declare void @callee(ptr)
 declare void @callee_addrspace5(ptr addrspace(5))
 
-!nvvm.annotations = !{!0}
 !nvvm.annotations = !{!1}
-!0 = !{ptr @kernel, !"kernel", i32 1}
 !1 = !{ptr @alloca_in_explicit_local_as, !"alloca_in_explicit_local_as", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index 9cfe919..208d4f0 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -29,7 +29,7 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
 ; PTX-NEXT:    .reg .pred %p<2>;
 ; PTX-NEXT:    .reg .b16 %rs<3>;
 ; PTX-NEXT:    .reg .b32 %r<11>;
-; PTX-NEXT:    .reg .b64 %rd<10>;
+; PTX-NEXT:    .reg .b64 %rd<9>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0: // %entry
 ; PTX-NEXT:    mov.u64 %SPL, __local_depot0;
@@ -38,23 +38,22 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
 ; PTX-NEXT:    and.b16 %rs2, %rs1, 1;
 ; PTX-NEXT:    setp.eq.b16 %p1, %rs2, 1;
 ; PTX-NEXT:    ld.param.s32 %rd1, [non_kernel_function_param_2];
-; PTX-NEXT:    add.u64 %rd2, %SP, 0;
-; PTX-NEXT:    or.b64 %rd3, %rd2, 8;
-; PTX-NEXT:    ld.param.u64 %rd4, [non_kernel_function_param_0+8];
-; PTX-NEXT:    st.u64 [%rd3], %rd4;
-; PTX-NEXT:    ld.param.u64 %rd5, [non_kernel_function_param_0];
-; PTX-NEXT:    st.u64 [%SP], %rd5;
-; PTX-NEXT:    mov.u64 %rd6, gi;
-; PTX-NEXT:    cvta.global.u64 %rd7, %rd6;
-; PTX-NEXT:    selp.b64 %rd8, %rd2, %rd7, %p1;
-; PTX-NEXT:    add.s64 %rd9, %rd8, %rd1;
-; PTX-NEXT:    ld.u8 %r1, [%rd9];
-; PTX-NEXT:    ld.u8 %r2, [%rd9+1];
+; PTX-NEXT:    ld.param.u64 %rd2, [non_kernel_function_param_0+8];
+; PTX-NEXT:    st.u64 [%SP+8], %rd2;
+; PTX-NEXT:    ld.param.u64 %rd3, [non_kernel_function_param_0];
+; PTX-NEXT:    st.u64 [%SP], %rd3;
+; PTX-NEXT:    mov.u64 %rd4, gi;
+; PTX-NEXT:    cvta.global.u64 %rd5, %rd4;
+; PTX-NEXT:    add.u64 %rd6, %SP, 0;
+; PTX-NEXT:    selp.b64 %rd7, %rd6, %rd5, %p1;
+; PTX-NEXT:    add.s64 %rd8, %rd7, %rd1;
+; PTX-NEXT:    ld.u8 %r1, [%rd8];
+; PTX-NEXT:    ld.u8 %r2, [%rd8+1];
 ; PTX-NEXT:    shl.b32 %r3, %r2, 8;
 ; PTX-NEXT:    or.b32 %r4, %r3, %r1;
-; PTX-NEXT:    ld.u8 %r5, [%rd9+2];
+; PTX-NEXT:    ld.u8 %r5, [%rd8+2];
 ; PTX-NEXT:    shl.b32 %r6, %r5, 16;
-; PTX-NEXT:    ld.u8 %r7, [%rd9+3];
+; PTX-NEXT:    ld.u8 %r7, [%rd8+3];
 ; PTX-NEXT:    shl.b32 %r8, %r7, 24;
 ; PTX-NEXT:    or.b32 %r9, %r8, %r6;
 ; PTX-NEXT:    or.b32 %r10, %r9, %r4;
@@ -68,7 +67,7 @@ entry:
   ret i32 %0, !dbg !23
 }
 
-define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) {
+define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) {
 ; PTX-LABEL: grid_const_int(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<4>;
@@ -82,7 +81,7 @@ define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %ou
 ; PTX-NEXT:    add.s32 %r3, %r2, %r1;
 ; PTX-NEXT:    st.global.u32 [%rd2], %r3;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_int(
+; OPT-LABEL: define ptx_kernel void @grid_const_int(
 ; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[OUT2:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[OUT3:%.*]] = addrspacecast ptr addrspace(1) [[OUT2]] to ptr
@@ -91,6 +90,7 @@ define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %ou
 ; OPT-NEXT:    [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]]
 ; OPT-NEXT:    store i32 [[ADD]], ptr [[OUT3]], align 4
 ; OPT-NEXT:    ret void
+;
   %tmp = load i32, ptr %input1, align 4
   %add = add i32 %tmp, %input2
   store i32 %add, ptr %out
@@ -99,7 +99,7 @@ define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %ou
 
 %struct.s = type { i32, i32 }
 
-define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
+define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
 ; PTX-LABEL: grid_const_struct(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<4>;
@@ -113,7 +113,7 @@ define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
 ; PTX-NEXT:    add.s32 %r3, %r1, %r2;
 ; PTX-NEXT:    st.global.u32 [%rd2], %r3;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_struct(
+; OPT-LABEL: define ptx_kernel void @grid_const_struct(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[OUT4:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[OUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUT4]] to ptr
@@ -125,6 +125,7 @@ define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
 ; OPT-NEXT:    [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]]
 ; OPT-NEXT:    store i32 [[ADD]], ptr [[OUT5]], align 4
 ; OPT-NEXT:    ret void
+;
   %gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
   %gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
   %int1 = load i32, ptr %gep1
@@ -134,7 +135,7 @@ define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
   ret void
 }
 
-define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
+define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
 ; PTX-LABEL: grid_const_escape(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<3>;
@@ -159,17 +160,18 @@ define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
 ; PTX-NEXT:    ld.param.b32 %r1, [retval0];
 ; PTX-NEXT:    } // callseq 0
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_escape(
+; OPT-LABEL: define ptx_kernel void @grid_const_escape(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
 ; OPT-NEXT:    [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
 ; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
 ; OPT-NEXT:    ret void
+;
   %call = call i32 @escape(ptr %input)
   ret void
 }
 
-define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) {
+define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) {
 ; PTX-LABEL: multiple_grid_const_escape(
 ; PTX:       {
 ; PTX-NEXT:    .local .align 4 .b8 __local_depot4[4];
@@ -212,7 +214,7 @@ define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32
 ; PTX-NEXT:    ld.param.b32 %r2, [retval0];
 ; PTX-NEXT:    } // callseq 1
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @multiple_grid_const_escape(
+; OPT-LABEL: define ptx_kernel void @multiple_grid_const_escape(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
 ; OPT-NEXT:    [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
@@ -222,13 +224,14 @@ define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32
 ; OPT-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
 ; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]])
 ; OPT-NEXT:    ret void
+;
   %a.addr = alloca i32, align 4
   store i32 %a, ptr %a.addr, align 4
   %call = call i32 @escape3(ptr %input, ptr %a.addr, ptr %b)
   ret void
 }
 
-define void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) {
+define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) {
 ; PTX-LABEL: grid_const_memory_escape(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b64 %rd<6>;
@@ -241,7 +244,7 @@ define void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %
 ; PTX-NEXT:    cvta.param.u64 %rd5, %rd4;
 ; PTX-NEXT:    st.global.u64 [%rd3], %rd5;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_memory_escape(
+; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[ADDR4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1)
 ; OPT-NEXT:    [[ADDR5:%.*]] = addrspacecast ptr addrspace(1) [[ADDR4]] to ptr
@@ -249,11 +252,12 @@ define void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %
 ; OPT-NEXT:    [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
 ; OPT-NEXT:    store ptr [[INPUT1]], ptr [[ADDR5]], align 8
 ; OPT-NEXT:    ret void
+;
   store ptr %input, ptr %addr, align 8
   ret void
 }
 
-define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) {
+define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) {
 ; PTX-LABEL: grid_const_inlineasm_escape(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b64 %rd<8>;
@@ -271,7 +275,7 @@ define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, pt
 ; PTX-NEXT:    st.global.u64 [%rd6], %rd1;
 ; PTX-NEXT:    ret;
 ; PTX-NOT      .local
-; OPT-LABEL: define void @grid_const_inlineasm_escape(
+; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[RESULT4:%.*]] = addrspacecast ptr [[RESULT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[RESULT5:%.*]] = addrspacecast ptr addrspace(1) [[RESULT4]] to ptr
@@ -282,6 +286,7 @@ define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, pt
 ; OPT-NEXT:    [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
 ; OPT-NEXT:    store i64 [[TMP2]], ptr [[RESULT5]], align 8
 ; OPT-NEXT:    ret void
+;
   %tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
   %tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
   %1 = call i64 asm "add.s64 $0, $1, $2;", "=l,l,l"(ptr %tmpptr1, ptr %tmpptr2) #1
@@ -289,7 +294,7 @@ define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, pt
   ret void
 }
 
-define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
+define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
 ; PTX-LABEL: grid_const_partial_escape(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<5>;
@@ -319,7 +324,7 @@ define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
 ; PTX-NEXT:    ld.param.b32 %r3, [retval0];
 ; PTX-NEXT:    } // callseq 2
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_partial_escape(
+; OPT-LABEL: define ptx_kernel void @grid_const_partial_escape(
 ; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr
@@ -330,6 +335,7 @@ define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
 ; OPT-NEXT:    store i32 [[TWICE]], ptr [[OUTPUT5]], align 4
 ; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]])
 ; OPT-NEXT:    ret void
+;
   %val = load i32, ptr %input
   %twice = add i32 %val, %val
   store i32 %twice, ptr %output
@@ -337,7 +343,7 @@ define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
   ret void
 }
 
-define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) {
+define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) {
 ; PTX-LABEL: grid_const_partial_escapemem(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<6>;
@@ -369,7 +375,7 @@ define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %outpu
 ; PTX-NEXT:    } // callseq 3
 ; PTX-NEXT:    st.param.b32 [func_retval0], %r3;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define i32 @grid_const_partial_escapemem(
+; OPT-LABEL: define ptx_kernel i32 @grid_const_partial_escapemem(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr
@@ -383,6 +389,7 @@ define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %outpu
 ; OPT-NEXT:    [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]]
 ; OPT-NEXT:    [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]])
 ; OPT-NEXT:    ret i32 [[ADD]]
+;
   %ptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
   %val1 = load i32, ptr %ptr1
   %ptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
@@ -393,7 +400,7 @@ define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %outpu
   ret i32 %add
 }
 
-define void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
+define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
 ; PTX-LABEL: grid_const_phi(
 ; PTX:       {
 ; PTX-NEXT:    .reg .pred %p<2>;
@@ -415,7 +422,7 @@ define void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
 ; PTX-NEXT:    ld.u32 %r2, [%rd8];
 ; PTX-NEXT:    st.global.u32 [%rd1], %r2;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_phi(
+; OPT-LABEL: define ptx_kernel void @grid_const_phi(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
@@ -435,6 +442,7 @@ define void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
 ; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
 ; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
 ; OPT-NEXT:    ret void
+;
 
   %val = load i32, ptr %inout
   %less = icmp slt i32 %val, 0
@@ -453,7 +461,7 @@ merge:
 }
 
 ; NOTE: %input2 is *not* grid_constant
-define void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) {
+define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) {
 ; PTX-LABEL: grid_const_phi_ngc(
 ; PTX:       {
 ; PTX-NEXT:    .reg .pred %p<2>;
@@ -478,7 +486,7 @@ define void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(
 ; PTX-NEXT:    ld.u32 %r2, [%rd11];
 ; PTX-NEXT:    st.global.u32 [%rd1], %r2;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_phi_ngc(
+; OPT-LABEL: define ptx_kernel void @grid_const_phi_ngc(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
@@ -500,6 +508,7 @@ define void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(
 ; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
 ; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
 ; OPT-NEXT:    ret void
+;
   %val = load i32, ptr %inout
   %less = icmp slt i32 %val, 0
   br i1 %less, label %first, label %second
@@ -517,7 +526,7 @@ merge:
 }
 
 ; NOTE: %input2 is *not* grid_constant
-define void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) {
+define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) {
 ; PTX-LABEL: grid_const_select(
 ; PTX:       {
 ; PTX-NEXT:    .reg .pred %p<2>;
@@ -539,7 +548,7 @@ define void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %i
 ; PTX-NEXT:    ld.u32 %r2, [%rd9];
 ; PTX-NEXT:    st.global.u32 [%rd3], %r2;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_select(
+; OPT-LABEL: define ptx_kernel void @grid_const_select(
 ; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
@@ -553,6 +562,7 @@ define void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %i
 ; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
 ; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
 ; OPT-NEXT:    ret void
+;
   %val = load i32, ptr %inout
   %less = icmp slt i32 %val, 0
   %ptrnew = select i1 %less, ptr %input1, ptr %input2
@@ -561,7 +571,7 @@ define void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %i
   ret void
 }
 
-define i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
+define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
 ; PTX-LABEL: grid_const_ptrtoint(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<4>;
@@ -576,7 +586,7 @@ define i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
 ; PTX-NEXT:    add.s32 %r3, %r1, %r2;
 ; PTX-NEXT:    st.param.b32 [func_retval0], %r3;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define i32 @grid_const_ptrtoint(
+; OPT-LABEL: define ptx_kernel i32 @grid_const_ptrtoint(
 ; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
 ; OPT-NEXT:    [[INPUT3:%.*]] = load i32, ptr addrspace(101) [[INPUT2]], align 4
@@ -584,6 +594,7 @@ define i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
 ; OPT-NEXT:    [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32
 ; OPT-NEXT:    [[KEEPALIVE:%.*]] = add i32 [[INPUT3]], [[PTRVAL]]
 ; OPT-NEXT:    ret i32 [[KEEPALIVE]]
+;
   %val = load i32, ptr %input
   %ptrval = ptrtoint ptr %input to i32
   %keepalive = add i32 %val, %ptrval
@@ -598,40 +609,40 @@ declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr
 
 !nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23}
 
-!0 = !{ptr @grid_const_int, !"kernel", i32 1, !"grid_constant", !1}
+!0 = !{ptr @grid_const_int, !"grid_constant", !1}
 !1 = !{i32 1}
 
-!2 = !{ptr @grid_const_struct, !"kernel", i32 1, !"grid_constant", !3}
+!2 = !{ptr @grid_const_struct, !"grid_constant", !3}
 !3 = !{i32 1}
 
-!4 = !{ptr @grid_const_escape, !"kernel", i32 1, !"grid_constant", !5}
+!4 = !{ptr @grid_const_escape, !"grid_constant", !5}
 !5 = !{i32 1}
 
-!6 = !{ptr @multiple_grid_const_escape, !"kernel", i32 1, !"grid_constant", !7}
+!6 = !{ptr @multiple_grid_const_escape, !"grid_constant", !7}
 !7 = !{i32 1, i32 3}
 
-!8 = !{ptr @grid_const_memory_escape, !"kernel", i32 1, !"grid_constant", !9}
+!8 = !{ptr @grid_const_memory_escape, !"grid_constant", !9}
 !9 = !{i32 1}
 
-!10 = !{ptr @grid_const_inlineasm_escape, !"kernel", i32 1, !"grid_constant", !11}
+!10 = !{ptr @grid_const_inlineasm_escape, !"grid_constant", !11}
 !11 = !{i32 1}
 
-!12 = !{ptr @grid_const_partial_escape, !"kernel", i32 1, !"grid_constant", !13}
+!12 = !{ptr @grid_const_partial_escape, !"grid_constant", !13}
 !13 = !{i32 1}
 
-!14 = !{ptr @grid_const_partial_escapemem, !"kernel", i32 1, !"grid_constant", !15}
+!14 = !{ptr @grid_const_partial_escapemem, !"grid_constant", !15}
 !15 = !{i32 1}
 
-!16 = !{ptr @grid_const_phi, !"kernel", i32 1, !"grid_constant", !17}
+!16 = !{ptr @grid_const_phi, !"grid_constant", !17}
 !17 = !{i32 1}
 
-!18 = !{ptr @grid_const_phi_ngc, !"kernel", i32 1, !"grid_constant", !19}
+!18 = !{ptr @grid_const_phi_ngc, !"grid_constant", !19}
 !19 = !{i32 1}
 
-!20 = !{ptr @grid_const_select, !"kernel", i32 1, !"grid_constant", !21}
+!20 = !{ptr @grid_const_select, !"grid_constant", !21}
 !21 = !{i32 1}
 
-!22 = !{ptr @grid_const_ptrtoint, !"kernel", i32 1, !"grid_constant", !23}
+!22 = !{ptr @grid_const_ptrtoint, !"grid_constant", !23}
 !23 = !{i32 1}
 
 
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index eba4f27..269bba7 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -65,7 +65,7 @@ define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) {
 }
 
 ; COMMON-LABEL: ptr_generic
-define void @ptr_generic(ptr %out, ptr %in) {
+define ptx_kernel void @ptr_generic(ptr %out, ptr %in) {
 ; IRC:  %in3 = addrspacecast ptr %in to ptr addrspace(1)
 ; IRC:  %in4 = addrspacecast ptr addrspace(1) %in3 to ptr
 ; IRC:  %out1 = addrspacecast ptr %out to ptr addrspace(1)
@@ -87,7 +87,7 @@ define void @ptr_generic(ptr %out, ptr %in) {
 }
 
 ; COMMON-LABEL: ptr_nongeneric
-define void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(4) %in) {
+define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 ; IR-NOT: addrspacecast
 ; PTX-NOT: cvta.to.global
 ; PTX:  ld.const.u32
@@ -98,7 +98,7 @@ define void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 }
 
 ; COMMON-LABEL: ptr_as_int
- define void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
+ define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
 ; IR:   [[P:%.*]] = inttoptr i64 %i to ptr
 ; IRC:  [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
 ; IRC:  addrspacecast ptr addrspace(1) [[P1]] to ptr
@@ -121,7 +121,7 @@ define void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 %struct.S = type { i64 }
 
 ; COMMON-LABEL: ptr_as_int_aggr
-define void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) align 8 %s, i32 noundef %v) {
+define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) align 8 %s, i32 noundef %v) {
 ; IR:   [[S:%.*]] = addrspacecast ptr %s to ptr addrspace(101)
 ; IR:   [[I:%.*]] = load i64, ptr addrspace(101) [[S]], align 8
 ; IR:   [[P0:%.*]] = inttoptr i64 [[I]] to ptr
@@ -146,8 +146,3 @@ define void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) ali
 
 ; Function Attrs: convergent nounwind
 declare dso_local ptr @escape(ptr) local_unnamed_addr
-!nvvm.annotations = !{!0, !1, !2, !3}
-!0 = !{ptr @ptr_generic, !"kernel", i32 1}
-!1 = !{ptr @ptr_nongeneric, !"kernel", i32 1}
-!2 = !{ptr @ptr_as_int, !"kernel", i32 1}
-!3 = !{ptr @ptr_as_int_aggr, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 5c52626..26102722 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -24,8 +24,8 @@ declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture read
 declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @read_only(
+define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @read_only(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -35,7 +35,7 @@ define dso_local void @read_only(ptr nocapture noundef writeonly %out, ptr nocap
 ; SM_60-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @read_only(
+; SM_70-LABEL: define dso_local ptx_kernel void @read_only(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -45,7 +45,7 @@ define dso_local void @read_only(ptr nocapture noundef writeonly %out, ptr nocap
 ; SM_70-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @read_only(
+; COPY-LABEL: define dso_local ptx_kernel void @read_only(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -62,8 +62,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @read_only_gep(
+define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @read_only_gep(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -74,7 +74,7 @@ define dso_local void @read_only_gep(ptr nocapture noundef writeonly %out, ptr n
 ; SM_60-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @read_only_gep(
+; SM_70-LABEL: define dso_local ptx_kernel void @read_only_gep(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -85,7 +85,7 @@ define dso_local void @read_only_gep(ptr nocapture noundef writeonly %out, ptr n
 ; SM_70-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @read_only_gep(
+; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -104,8 +104,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @read_only_gep_asc(
+define dso_local ptx_kernel void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @read_only_gep_asc(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -116,7 +116,7 @@ define dso_local void @read_only_gep_asc(ptr nocapture noundef writeonly %out, p
 ; SM_60-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @read_only_gep_asc(
+; SM_70-LABEL: define dso_local ptx_kernel void @read_only_gep_asc(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -127,7 +127,7 @@ define dso_local void @read_only_gep_asc(ptr nocapture noundef writeonly %out, p
 ; SM_70-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @read_only_gep_asc(
+; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep_asc(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -148,8 +148,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @read_only_gep_asc0(
+define dso_local ptx_kernel void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -164,7 +164,7 @@ define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out,
 ; SM_60-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @read_only_gep_asc0(
+; SM_70-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -179,7 +179,7 @@ define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out,
 ; SM_70-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @read_only_gep_asc0(
+; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -202,8 +202,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @escape_ptr(
+define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr(
 ; SM_60-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -214,7 +214,7 @@ define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr nound
 ; SM_60-NEXT:    call void @_Z6escapePv(ptr noundef nonnull [[S3]])
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @escape_ptr(
+; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr(
 ; SM_70-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -225,7 +225,7 @@ define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr nound
 ; SM_70-NEXT:    call void @_Z6escapePv(ptr noundef nonnull [[S3]])
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @escape_ptr(
+; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr(
 ; COPY-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -240,8 +240,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @escape_ptr_gep(
+define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
 ; SM_60-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -253,7 +253,7 @@ define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr n
 ; SM_60-NEXT:    call void @_Z6escapePv(ptr noundef nonnull [[B]])
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @escape_ptr_gep(
+; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
 ; SM_70-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -265,7 +265,7 @@ define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr n
 ; SM_70-NEXT:    call void @_Z6escapePv(ptr noundef nonnull [[B]])
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @escape_ptr_gep(
+; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
 ; COPY-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -282,8 +282,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @escape_ptr_store(
+define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_store(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -294,7 +294,7 @@ define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, pt
 ; SM_60-NEXT:    store ptr [[S3]], ptr [[OUT2]], align 8
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @escape_ptr_store(
+; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_store(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -305,7 +305,7 @@ define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, pt
 ; SM_70-NEXT:    store ptr [[S3]], ptr [[OUT2]], align 8
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @escape_ptr_store(
+; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_store(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -320,8 +320,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @escape_ptr_gep_store(
+define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -333,7 +333,7 @@ define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out
 ; SM_60-NEXT:    store ptr [[B]], ptr [[OUT2]], align 8
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @escape_ptr_gep_store(
+; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -345,7 +345,7 @@ define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out
 ; SM_70-NEXT:    store ptr [[B]], ptr [[OUT2]], align 8
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @escape_ptr_gep_store(
+; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -362,8 +362,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @escape_ptrtoint(
+define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptrtoint(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -375,7 +375,7 @@ define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr
 ; SM_60-NEXT:    store i64 [[I]], ptr [[OUT2]], align 8
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @escape_ptrtoint(
+; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptrtoint(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -387,7 +387,7 @@ define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr
 ; SM_70-NEXT:    store i64 [[I]], ptr [[OUT2]], align 8
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @escape_ptrtoint(
+; COPY-LABEL: define dso_local ptx_kernel void @escape_ptrtoint(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -404,8 +404,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @memcpy_from_param(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @memcpy_from_param(
+define dso_local ptx_kernel void @memcpy_from_param(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -414,7 +414,7 @@ define dso_local void @memcpy_from_param(ptr nocapture noundef writeonly %out, p
 ; SM_60-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @memcpy_from_param(
+; SM_70-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -423,7 +423,7 @@ define dso_local void @memcpy_from_param(ptr nocapture noundef writeonly %out, p
 ; SM_70-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @memcpy_from_param(
+; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -438,8 +438,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @memcpy_from_param_noalign (ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @memcpy_from_param_noalign(
+define dso_local ptx_kernel void @memcpy_from_param_noalign (ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -448,7 +448,7 @@ define dso_local void @memcpy_from_param_noalign (ptr nocapture noundef writeonl
 ; SM_60-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @memcpy_from_param_noalign(
+; SM_70-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -457,7 +457,7 @@ define dso_local void @memcpy_from_param_noalign (ptr nocapture noundef writeonl
 ; SM_70-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @memcpy_from_param_noalign(
+; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 8
@@ -472,8 +472,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @memcpy_to_param(
+define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @memcpy_to_param(
 ; SM_60-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -484,7 +484,7 @@ define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr n
 ; SM_60-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true)
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @memcpy_to_param(
+; SM_70-LABEL: define dso_local ptx_kernel void @memcpy_to_param(
 ; SM_70-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -495,7 +495,7 @@ define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr n
 ; SM_70-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true)
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @memcpy_to_param(
+; COPY-LABEL: define dso_local ptx_kernel void @memcpy_to_param(
 ; COPY-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -510,8 +510,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @copy_on_store(
+define dso_local ptx_kernel void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @copy_on_store(
 ; SM_60-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[BB:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -523,7 +523,7 @@ define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr noc
 ; SM_60-NEXT:    store i32 [[I]], ptr [[S3]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @copy_on_store(
+; SM_70-LABEL: define dso_local ptx_kernel void @copy_on_store(
 ; SM_70-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[BB:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -535,7 +535,7 @@ define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr noc
 ; SM_70-NEXT:    store i32 [[I]], ptr [[S3]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @copy_on_store(
+; COPY-LABEL: define dso_local ptx_kernel void @copy_on_store(
 ; COPY-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[BB:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -551,8 +551,8 @@ bb:
   ret void
 }
 
-define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) {
-; SM_60-LABEL: define void @test_select(
+define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) {
+; SM_60-LABEL: define ptx_kernel void @test_select(
 ; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
 ; SM_60-NEXT:  [[BB:.*:]]
 ; SM_60-NEXT:    [[OUT7:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
@@ -568,7 +568,7 @@ define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2,
 ; SM_60-NEXT:    store i32 [[VALLOADED]], ptr [[OUT8]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define void @test_select(
+; SM_70-LABEL: define ptx_kernel void @test_select(
 ; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
 ; SM_70-NEXT:  [[BB:.*:]]
 ; SM_70-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
@@ -582,7 +582,7 @@ define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2,
 ; SM_70-NEXT:    store i32 [[VALLOADED]], ptr [[OUT2]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define void @test_select(
+; COPY-LABEL: define ptx_kernel void @test_select(
 ; COPY-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
 ; COPY-NEXT:  [[BB:.*:]]
 ; COPY-NEXT:    [[INPUT23:%.*]] = alloca i32, align 4
@@ -603,8 +603,8 @@ bb:
   ret void
 }
 
-define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) {
-; SM_60-LABEL: define void @test_select_write(
+define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) {
+; SM_60-LABEL: define ptx_kernel void @test_select_write(
 ; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
 ; SM_60-NEXT:  [[BB:.*:]]
 ; SM_60-NEXT:    [[OUT5:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
@@ -619,7 +619,7 @@ define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %i
 ; SM_60-NEXT:    store i32 1, ptr [[PTRNEW]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define void @test_select_write(
+; SM_70-LABEL: define ptx_kernel void @test_select_write(
 ; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
 ; SM_70-NEXT:  [[BB:.*:]]
 ; SM_70-NEXT:    [[OUT5:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
@@ -634,7 +634,7 @@ define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %i
 ; SM_70-NEXT:    store i32 1, ptr [[PTRNEW]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define void @test_select_write(
+; COPY-LABEL: define ptx_kernel void @test_select_write(
 ; COPY-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
 ; COPY-NEXT:  [[BB:.*:]]
 ; COPY-NEXT:    [[INPUT23:%.*]] = alloca i32, align 4
@@ -653,8 +653,8 @@ bb:
   ret void
 }
 
-define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, ptr %inout, i1 %cond) {
-; SM_60-LABEL: define void @test_phi(
+define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, ptr %inout, i1 %cond) {
+; SM_60-LABEL: define ptx_kernel void @test_phi(
 ; SM_60-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
 ; SM_60-NEXT:  [[BB:.*:]]
 ; SM_60-NEXT:    [[INOUT7:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
@@ -678,7 +678,7 @@ define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S)
 ; SM_60-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT8]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define void @test_phi(
+; SM_70-LABEL: define ptx_kernel void @test_phi(
 ; SM_70-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
 ; SM_70-NEXT:  [[BB:.*:]]
 ; SM_70-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
@@ -700,7 +700,7 @@ define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S)
 ; SM_70-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define void @test_phi(
+; COPY-LABEL: define ptx_kernel void @test_phi(
 ; COPY-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
 ; COPY-NEXT:  [[BB:.*:]]
 ; COPY-NEXT:    [[INPUT23:%.*]] = alloca [[STRUCT_S]], align 8
@@ -740,8 +740,8 @@ merge:                                            ; preds = %second, %first
   ret void
 }
 
-define void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, i1 %cond) {
-; COMMON-LABEL: define void @test_phi_write(
+define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, i1 %cond) {
+; COMMON-LABEL: define ptx_kernel void @test_phi_write(
 ; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
 ; COMMON-NEXT:  [[BB:.*:]]
 ; COMMON-NEXT:    [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
@@ -784,29 +784,11 @@ attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite
 attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
 
 !llvm.module.flags = !{!0, !1, !2, !3}
-!nvvm.annotations = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !23}
 !llvm.ident = !{!20, !21}
 
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 !3 = !{i32 7, !"frame-pointer", i32 2}
-!4 = !{ptr @read_only, !"kernel", i32 1}
-!5 = !{ptr @escape_ptr, !"kernel", i32 1}
-!6 = !{ptr @escape_ptr_gep, !"kernel", i32 1}
-!7 = !{ptr @escape_ptr_store, !"kernel", i32 1}
-!8 = !{ptr @escape_ptr_gep_store, !"kernel", i32 1}
-!9 = !{ptr @escape_ptrtoint, !"kernel", i32 1}
-!10 = !{ptr @memcpy_from_param, !"kernel", i32 1}
-!11 = !{ptr @memcpy_to_param, !"kernel", i32 1}
-!12 = !{ptr @copy_on_store, !"kernel", i32 1}
-!13 = !{ptr @read_only_gep, !"kernel", i32 1}
-!14 = !{ptr @read_only_gep_asc, !"kernel", i32 1}
-!15 = !{ptr @read_only_gep_asc0, !"kernel", i32 1}
-!16 = !{ptr @test_select, !"kernel", i32 1}
-!17 = !{ptr @test_phi, !"kernel", i32 1}
-!18 = !{ptr @test_phi_write, !"kernel", i32 1}
-!19 = !{ptr @test_select_write, !"kernel", i32 1}
 !20 = !{!"clang version 20.0.0git"}
 !21 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
-!23 = !{ptr @memcpy_from_param_noalign, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
index f8b3b4b..4ee1ca3 100644
--- a/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
@@ -43,7 +43,7 @@ define internal void @bar() {
   ret void
 }
 
-; CHECK-LABEL: define weak_odr void @"nvptx$device$init"() {
+; CHECK-LABEL: define weak_odr ptx_kernel void @"nvptx$device$init"() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BEGIN:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__init_array_start, align 8
 ; CHECK-NEXT:    [[STOP:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__init_array_end, align 8
@@ -60,7 +60,7 @@ define internal void @bar() {
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define weak_odr void @"nvptx$device$fini"() {
+; CHECK-LABEL: define weak_odr ptx_kernel void @"nvptx$device$fini"() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BEGIN:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__fini_array_start, align 8
 ; CHECK-NEXT:    [[STOP:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__fini_array_end, align 8
@@ -82,12 +82,10 @@ define internal void @bar() {
 ; CHECK:       while.end:
 ; CHECK-NEXT:    ret void
 
-; CHECK: [[META0:![0-9]+]] = !{ptr @"nvptx$device$init", !"kernel", i32 1}
 ; CHECK: [[META1:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidx", i32 1}
 ; CHECK: [[META2:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidy", i32 1}
 ; CHECK: [[META3:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidz", i32 1}
 ; CHECK: [[META4:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxclusterrank", i32 1}
-; CHECK: [[META5:![0-9]+]] = !{ptr @"nvptx$device$fini", !"kernel", i32 1}
 ; CHECK: [[META6:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidx", i32 1}
 ; CHECK: [[META7:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidy", i32 1}
 ; CHECK: [[META8:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidz", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
index 9ec690a..2e64c25 100644
--- a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
@@ -6,7 +6,7 @@ target triple = "nvptx64-nvidia-cuda"
 
 ; Verify that both %input and %output are converted to global pointers and then
 ; addrspacecast'ed back to the original type.
-define void @kernel(ptr %input, ptr %output) {
+define ptx_kernel void @kernel(ptr %input, ptr %output) {
 ; CHECK-LABEL: .visible .entry kernel(
 ; CHECK: cvta.to.global.u64
 ; CHECK: cvta.to.global.u64
@@ -17,7 +17,7 @@ define void @kernel(ptr %input, ptr %output) {
   ret void
 }
 
-define void @kernel2(ptr addrspace(1) %input, ptr addrspace(1) %output) {
+define ptx_kernel void @kernel2(ptr addrspace(1) %input, ptr addrspace(1) %output) {
 ; CHECK-LABEL: .visible .entry kernel2(
 ; CHECK-NOT: cvta.to.global.u64
   %1 = load float, ptr addrspace(1) %input, align 4
@@ -29,7 +29,7 @@ define void @kernel2(ptr addrspace(1) %input, ptr addrspace(1) %output) {
 
 %struct.S = type { ptr, ptr }
 
-define void @ptr_in_byval_kernel(ptr byval(%struct.S) %input, ptr %output) {
+define ptx_kernel void @ptr_in_byval_kernel(ptr byval(%struct.S) %input, ptr %output) {
 ; CHECK-LABEL: .visible .entry ptr_in_byval_kernel(
 ; CHECK: ld.param.u64 	%[[optr:rd.*]], [ptr_in_byval_kernel_param_1]
 ; CHECK: cvta.to.global.u64 %[[optr_g:.*]], %[[optr]];
@@ -60,7 +60,3 @@ define void @ptr_in_byval_func(ptr byval(%struct.S) %input, ptr %output) {
   ret void
 }
 
-!nvvm.annotations = !{!0, !1, !2}
-!0 = !{ptr @kernel, !"kernel", i32 1}
-!1 = !{ptr @kernel2, !"kernel", i32 1}
-!2 = !{ptr @ptr_in_byval_kernel, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/maxclusterrank.ll b/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
index 3389e09..c445c34 100644
--- a/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
+++ b/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
@@ -11,16 +11,15 @@ target triple = "nvptx64-unknown-unknown"
 
 ; Make sure that for SM version prior to 90 `.maxclusterrank` directive is
 ; sielently ignored.
-define dso_local void @_Z18TestMaxClusterRankv() {
+define dso_local ptx_kernel void @_Z18TestMaxClusterRankv() {
 entry:
   %a = alloca i32, align 4
   store volatile i32 1, ptr %a, align 4
   ret void
 }
 
-!nvvm.annotations = !{!0, !1, !2, !3}
+!nvvm.annotations = !{!1, !2, !3}
 
-!0 = !{ptr @_Z18TestMaxClusterRankv, !"kernel", i32 1}
 !1 = !{ptr @_Z18TestMaxClusterRankv, !"maxntidx", i32 128}
 !2 = !{ptr @_Z18TestMaxClusterRankv, !"minctasm", i32 2}
 !3 = !{ptr @_Z18TestMaxClusterRankv, !"maxclusterrank", i32 8}
diff --git a/llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll b/llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
index 2bc6d4c..2a0c5ab 100644
--- a/llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
+++ b/llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
@@ -66,7 +66,4 @@ if.end17:                                         ; preds = %if.else13, %if.then
 }
 
 ; Function Attrs: noduplicate nounwind
-declare void @llvm.nvvm.barrier0() #2
-
-!0 = !{ptr @foo, !"kernel", i32 1}
-!1 = !{null, !"align", i32 8}
+declare void @llvm.nvvm.barrier0() #2
+\ No newline at end of file
diff --git a/llvm/test/CodeGen/NVPTX/noreturn.ll b/llvm/test/CodeGen/NVPTX/noreturn.ll
index 2161d70..6c11d0a 100644
--- a/llvm/test/CodeGen/NVPTX/noreturn.ll
+++ b/llvm/test/CodeGen/NVPTX/noreturn.ll
@@ -27,7 +27,7 @@ define void @true_noreturn0() #0 {
 ; CHECK: .entry ignore_kernel_noreturn()
 ; CHECK-NOT: .noreturn
 
-define void @ignore_kernel_noreturn() #0 {
+define ptx_kernel void @ignore_kernel_noreturn() #0 {
   unreachable
 }
 
@@ -35,7 +35,7 @@ define void @ignore_kernel_noreturn() #0 {
 ; CHECK: prototype_{{[0-9]+}} : .callprototype ()_ (.param .b32 _) .noreturn;
 ; CHECK: prototype_{{[0-9]+}} : .callprototype (.param .b32 _) _ (.param .b32 _);
 
-define void @callprototype_noreturn(i32) {
+define ptx_kernel void @callprototype_noreturn(i32) {
   %fn = load ptr, ptr addrspace(1) @function_pointer
   call void %fn(i32 %0) #0
   %non_void = bitcast ptr %fn to ptr
@@ -44,8 +44,3 @@ define void @callprototype_noreturn(i32) {
 }
 
 attributes #0 = { noreturn }
-
-!nvvm.annotations = !{!0, !1}
-
-!0 = !{ptr @ignore_kernel_noreturn, !"kernel", i32 1}
-!1 = !{ptr @callprototype_noreturn, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll b/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll
index 48162ea..9a78d31 100644
--- a/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll
+++ b/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll
@@ -3,7 +3,7 @@
 
 target triple = "nvptx-unknown-nvcl"
 
-define void @foo(i64 %img, i64 %sampler, ptr align 32 %v1, ptr %v2) {
+define ptx_kernel void @foo(i64 %img, i64 %sampler, ptr align 32 %v1, ptr %v2) {
 ; The parameter alignment is determined by the align attribute (default 1).
 ; CHECK-LABEL: .entry foo(
 ; CHECK: .param .u64 .ptr .align 32 foo_param_2
@@ -11,7 +11,6 @@ define void @foo(i64 %img, i64 %sampler, ptr align 32 %v1, ptr %v2) {
   ret void
 }
 
-!nvvm.annotations = !{!1, !2, !3}
-!1 = !{ptr @foo, !"kernel", i32 1}
+!nvvm.annotations = !{!2, !3}
 !2 = !{ptr @foo, !"rdoimage", i32 0}
 !3 = !{ptr @foo, !"sampler", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll
index ac5875c..83cb3cd 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll
@@ -1,9 +1,9 @@
 ; Libdevice in recent CUDA versions relies on __CUDA_ARCH reflecting GPU type.
 ; Verify that __nvvm_reflect() is replaced with an appropriate value.
 ;
-; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_20 \
+; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_20 \
 ; RUN:   | FileCheck %s --check-prefixes=COMMON,SM20
-; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_35 \
+; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_35 \
 ; RUN:   | FileCheck %s --check-prefixes=COMMON,SM35
 
 @"$str" = private addrspace(1) constant [12 x i8] c"__CUDA_ARCH\00"
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll
index 9d38321..bf8d6e2 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll
@@ -1,8 +1,8 @@
 ; Verify that __nvvm_reflect_ocl() is replaced with an appropriate value
 ;
-; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_20 \
+; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_20 \
 ; RUN:   | FileCheck %s --check-prefixes=COMMON,SM20
-; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_35 \
+; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_35 \
 ; RUN:   | FileCheck %s --check-prefixes=COMMON,SM35
 
 @"$str" = private addrspace(4) constant [12 x i8] c"__CUDA_ARCH\00"
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
index 46ab79d..19c74df 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
@@ -3,12 +3,12 @@
 
 ; RUN: cat %s > %t.noftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 0}' >> %t.noftz
-; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='default<O2>' \
+; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_0 --check-prefix=CHECK
 
 ; RUN: cat %s > %t.ftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.ftz
-; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='default<O2>' \
+; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_1 --check-prefix=CHECK
 
 @str = private unnamed_addr addrspace(4) constant [11 x i8] c"__CUDA_FTZ\00"
@@ -43,7 +43,7 @@ exit:
 
 declare i32 @llvm.nvvm.reflect(ptr)
 
-; CHECK-LABEL: define noundef i32 @intrinsic
+; CHECK-LABEL: define i32 @intrinsic
 define i32 @intrinsic() {
 ; CHECK-NOT: call i32 @llvm.nvvm.reflect
 ; USE_FTZ_0: ret i32 0
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
index 2ed9f7c..244b44f 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
@@ -3,12 +3,12 @@
 
 ; RUN: cat %s > %t.noftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 0}' >> %t.noftz
-; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='default<O2>' \
+; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_0 --check-prefix=CHECK
 
 ; RUN: cat %s > %t.ftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.ftz
-; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='default<O2>' \
+; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_1 --check-prefix=CHECK
 
 @str = private unnamed_addr addrspace(4) constant [11 x i8] c"__CUDA_FTZ\00"
@@ -43,7 +43,8 @@ exit:
 
 declare i32 @llvm.nvvm.reflect(ptr)
 
-; CHECK-LABEL: define noundef i32 @intrinsic
+; CHECK-LABEL: define i32 @intrinsic
+
 define i32 @intrinsic() {
 ; CHECK-NOT: call i32 @llvm.nvvm.reflect
 ; USE_FTZ_0: ret i32 0
diff --git a/llvm/test/CodeGen/NVPTX/refl1.ll b/llvm/test/CodeGen/NVPTX/refl1.ll
index 34db3bb..99b83f4 100644
--- a/llvm/test/CodeGen/NVPTX/refl1.ll
+++ b/llvm/test/CodeGen/NVPTX/refl1.ll
@@ -5,7 +5,7 @@ target triple = "nvptx-nvidia-cuda"
 
 ; Function Attrs: nounwind
 ; CHECK: .entry foo
-define void @foo(ptr nocapture %a) #0 {
+define ptx_kernel void @foo(ptr nocapture %a) #0 {
   %val = load float, ptr %a
   %tan = tail call fastcc float @__nv_fast_tanf(float %val)
   store float %tan, ptr %a
@@ -34,7 +34,3 @@ entry:
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 attributes #2 = { alwaysinline inlinehint nounwind readnone }
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @foo, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/reg-copy.ll b/llvm/test/CodeGen/NVPTX/reg-copy.ll
index f66ef19..20396c4 100644
--- a/llvm/test/CodeGen/NVPTX/reg-copy.ll
+++ b/llvm/test/CodeGen/NVPTX/reg-copy.ll
@@ -4,7 +4,7 @@
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-unknown-unknown"
 
-define void @PR24303(ptr %f) {
+define ptx_kernel void @PR24303(ptr %f) {
 ; CHECK-LABEL: .visible .entry PR24303(
 ; Do not use mov.f or mov.u to convert between float and int.
 ; CHECK-NOT: mov.{{f|u}}{{32|64}} %f{{[0-9]+}}, %r{{[0-9]+}}
@@ -217,7 +217,3 @@ _ZN12cuda_builtinmlIfEENS_7complexIT_EERKS3_S5_.exit: ; preds = %if.then.93.i, %
 }
 
 declare float @llvm.nvvm.fabs.f(float)
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @PR24303, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/simple-call.ll b/llvm/test/CodeGen/NVPTX/simple-call.ll
index 3580604..991ae04 100644
--- a/llvm/test/CodeGen/NVPTX/simple-call.ll
+++ b/llvm/test/CodeGen/NVPTX/simple-call.ll
@@ -10,7 +10,7 @@ define float @device_func(float %a) noinline {
 }
 
 ; CHECK: .entry kernel_func
-define void @kernel_func(ptr %a) {
+define ptx_kernel void @kernel_func(ptr %a) {
   %val = load float, ptr %a
 ; CHECK: call.uni (retval0),
 ; CHECK: device_func,
@@ -18,9 +18,3 @@ define void @kernel_func(ptr %a) {
   store float %mul, ptr %a
   ret void
 }
-
-
-
-!nvvm.annotations = !{!1}
-
-!1 = !{ptr @kernel_func, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
index 504dcde..7a7904a 100644
--- a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
@@ -10,7 +10,7 @@ declare i32 @llvm.nvvm.suld.1d.i32.trap(i64, i32)
 declare i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1))
 
 
-define void @foo(i64 %img, ptr %red, i32 %idx) {
+define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) {
 ; CHECK-LABEL: foo(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -34,7 +34,7 @@ define void @foo(i64 %img, ptr %red, i32 %idx) {
 
 @surf0 = internal addrspace(1) global i64 0, align 8
 
-define void @bar(ptr %red, i32 %idx) {
+define ptx_kernel void @bar(ptr %red, i32 %idx) {
 ; CHECK-LABEL: bar(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -56,11 +56,5 @@ define void @bar(ptr %red, i32 %idx) {
   ret void
 }
 
-
-
-
-!nvvm.annotations = !{!1, !2, !3}
-!1 = !{ptr @foo, !"kernel", i32 1}
-!2 = !{ptr @bar, !"kernel", i32 1}
-!3 = !{ptr addrspace(1) @surf0, !"surface", i32 1}
-
+!nvvm.annotations = !{!1}
+!1 = !{ptr addrspace(1) @surf0, !"surface", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/surf-read.ll b/llvm/test/CodeGen/NVPTX/surf-read.ll
index e0cebd6..cd11b56 100644
--- a/llvm/test/CodeGen/NVPTX/surf-read.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-read.ll
@@ -6,7 +6,7 @@ target triple = "nvptx64-unknown-nvcl"
 declare i32 @llvm.nvvm.suld.1d.i32.trap(i64, i32)
 
 ; CHECK: .entry foo
-define void @foo(i64 %img, ptr %red, i32 %idx) {
+define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) {
 ; CHECK: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [foo_param_0, {%r{{[0-9]+}}}]
   %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %img, i32 %idx)
 ; CHECK: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
@@ -16,6 +16,5 @@ define void @foo(i64 %img, ptr %red, i32 %idx) {
   ret void
 }
 
-!nvvm.annotations = !{!1, !2}
-!1 = !{ptr @foo, !"kernel", i32 1}
-!2 = !{ptr @foo, !"rdwrimage", i32 0}
+!nvvm.annotations = !{!1}
+!1 = !{ptr @foo, !"rdwrimage", i32 0}
diff --git a/llvm/test/CodeGen/NVPTX/surf-tex.py b/llvm/test/CodeGen/NVPTX/surf-tex.py
index 9607a58..90d6766 100644
--- a/llvm/test/CodeGen/NVPTX/surf-tex.py
+++ b/llvm/test/CodeGen/NVPTX/surf-tex.py
@@ -224,11 +224,6 @@ def get_ptx_surface(target):
 def get_surface_metadata(target, fun_ty, fun_name, has_surface_param):
     metadata = []
 
-    md_kernel = '!{{{fun_ty} @{fun_name}, !"kernel", i32 1}}'.format(
-        fun_ty=fun_ty, fun_name=fun_name
-    )
-    metadata.append(md_kernel)
-
     if target == "cuda":
         # When a parameter is lowered as a .surfref, it still has the
         # corresponding ld.param.u64, which is illegal. Do not emit the
@@ -263,14 +258,14 @@ def gen_suld_tests(target, global_surf):
   ; CHECK-LABEL: .entry ${test_name}_param
   ; CHECK: ${instruction} ${reg_ret}, [${reg_surf}, ${reg_access}]
   ;
-  define void @${test_name}_param(i64 %s, ${retty}* %ret, ${access}) {
+  define ptx_kernel void @${test_name}_param(i64 %s, ${retty}* %ret, ${access}) {
     %val = tail call ${retty} @${intrinsic}(i64 %s, ${access})
     store ${retty} %val, ${retty}* %ret
     ret void
   }
   ; CHECK-LABEL: .entry ${test_name}_global
   ; CHECK: ${instruction} ${reg_ret}, [${global_surf}, ${reg_access}]
-  define void @${test_name}_global(${retty}* %ret, ${access}) {
+  define ptx_kernel void @${test_name}_global(${retty}* %ret, ${access}) {
     %gs = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_surf})
     %val = tail call ${retty} @${intrinsic}(i64 %gs, ${access})
     store ${retty} %val, ${retty}* %ret
@@ -356,13 +351,13 @@ def gen_sust_tests(target, global_surf):
   ; CHECK-LABEL: .entry ${test_name}_param
   ; CHECK: ${instruction} [${reg_surf}, ${reg_access}], ${reg_value}
   ;
-  define void @${test_name}_param(i64 %s, ${value}, ${access}) {
+  define ptx_kernel void @${test_name}_param(i64 %s, ${value}, ${access}) {
     tail call void @${intrinsic}(i64 %s, ${access}, ${value})
     ret void
   }
   ; CHECK-LABEL: .entry ${test_name}_global
   ; CHECK: ${instruction} [${global_surf}, ${reg_access}], ${reg_value}
-  define void @${test_name}_global(${value}, ${access}) {
+  define ptx_kernel void @${test_name}_global(${value}, ${access}) {
     %gs = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_surf})
     tail call void @${intrinsic}(i64 %gs, ${access}, ${value})
     ret void
@@ -420,19 +415,13 @@ def gen_sust_tests(target, global_surf):
         generated_items.append((params["intrinsic"], params["instruction"]))
 
         fun_name = test_name + "_param"
-        fun_ty = "void (i64, {value_ty}, {access_ty})*".format(
-            value_ty=get_llvm_value_type(vec, ctype),
-            access_ty=get_llvm_surface_access_type(geom),
-        )
+        fun_ty = "ptr"
         generated_metadata += get_surface_metadata(
             target, fun_ty, fun_name, has_surface_param=True
         )
 
         fun_name = test_name + "_global"
-        fun_ty = "void ({value_ty}, {access_ty})*".format(
-            value_ty=get_llvm_value_type(vec, ctype),
-            access_ty=get_llvm_surface_access_type(geom),
-        )
+        fun_ty = "ptr"
         generated_metadata += get_surface_metadata(
             target, fun_ty, fun_name, has_surface_param=False
         )
@@ -559,11 +548,6 @@ def get_ptx_global_sampler(target, global_sampler):
 def get_texture_metadata(target, fun_ty, fun_name, has_texture_params):
     metadata = []
 
-    md_kernel = '!{{{fun_ty} @{fun_name}, !"kernel", i32 1}}'.format(
-        fun_ty=fun_ty, fun_name=fun_name
-    )
-    metadata.append(md_kernel)
-
     if target == "cuda":
         # When a parameter is lowered as a .texref, it still has the
         # corresponding ld.param.u64, which is illegal. Do not emit the
@@ -615,14 +599,14 @@ def gen_tex_tests(target, global_tex, global_sampler):
 
   ; CHECK-LABEL: .entry ${test_name}_param
   ; CHECK: ${instruction} ${ptx_ret}, [${ptx_tex}, ${ptx_access}]
-  define void @${test_name}_param(i64 %tex, ${sampler} ${retty}* %ret, ${access}) {
+  define ptx_kernel void @${test_name}_param(i64 %tex, ${sampler} ${retty}* %ret, ${access}) {
     %val = tail call ${retty} @${intrinsic}(i64 %tex, ${sampler} ${access})
     store ${retty} %val, ${retty}* %ret
     ret void
   }
   ; CHECK-LABEL: .entry ${test_name}_global
   ; CHECK: ${instruction} ${ptx_ret}, [${global_tex}, ${ptx_global_sampler} ${ptx_access}]
-  define void @${test_name}_global(${retty}* %ret, ${access}) {
+  define ptx_kernel void @${test_name}_global(${retty}* %ret, ${access}) {
     %gt = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_tex})
     ${get_sampler_handle}
     %val = tail call ${retty} @${intrinsic}(i64 %gt, ${sampler} ${access})
@@ -799,14 +783,14 @@ def gen_tld4_tests(target, global_tex, global_sampler):
 
   ; CHECK-LABEL: .entry ${test_name}_param
   ; CHECK: ${instruction} ${ptx_ret}, [${ptx_tex}, ${ptx_access}]
-  define void @${test_name}_param(i64 %tex, ${sampler} ${retty}* %ret, ${access}) {
+  define ptx_kernel void @${test_name}_param(i64 %tex, ${sampler} ${retty}* %ret, ${access}) {
     %val = tail call ${retty} @${intrinsic}(i64 %tex, ${sampler} ${access})
     store ${retty} %val, ${retty}* %ret
     ret void
   }
   ; CHECK-LABEL: .entry ${test_name}_global
   ; CHECK: ${instruction} ${ptx_ret}, [${global_tex}, ${ptx_global_sampler} ${ptx_access}]
-  define void @${test_name}_global(${retty}* %ret, ${access}) {
+  define ptx_kernel void @${test_name}_global(${retty}* %ret, ${access}) {
     %gt = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_tex})
     ${get_sampler_handle}
     %val = tail call ${retty} @${intrinsic}(i64 %gt, ${sampler} ${access})
diff --git a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
index 881ea45..5dc44cb 100644
--- a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
@@ -10,7 +10,7 @@ declare void @llvm.nvvm.sust.b.1d.i32.trap(i64, i32, i32)
 declare i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1))
 
 
-define void @foo(i64 %img, i32 %val, i32 %idx) {
+define ptx_kernel void @foo(i64 %img, i32 %val, i32 %idx) {
 ; CHECK-LABEL: foo(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -30,7 +30,7 @@ define void @foo(i64 %img, i32 %val, i32 %idx) {
 @surf0 = internal addrspace(1) global i64 0, align 8
 
 
-define void @bar(i32 %val, i32 %idx) {
+define ptx_kernel void @bar(i32 %val, i32 %idx) {
 ; CHECK-LABEL: bar(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -47,8 +47,6 @@ define void @bar(i32 %val, i32 %idx) {
 }
 
 
-!nvvm.annotations = !{!1, !2, !3}
-!1 = !{ptr @foo, !"kernel", i32 1}
-!2 = !{ptr @bar, !"kernel", i32 1}
-!3 = !{ptr addrspace(1) @surf0, !"surface", i32 1}
+!nvvm.annotations = !{!1}
+!1 = !{ptr addrspace(1) @surf0, !"surface", i32 1}
 
diff --git a/llvm/test/CodeGen/NVPTX/surf-write.ll b/llvm/test/CodeGen/NVPTX/surf-write.ll
index 258bb6d..0e1f0cc 100644
--- a/llvm/test/CodeGen/NVPTX/surf-write.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-write.ll
@@ -6,12 +6,11 @@ target triple = "nvptx-unknown-nvcl"
 declare void @llvm.nvvm.sust.b.1d.i32.trap(i64, i32, i32)
 
 ; CHECK: .entry foo
-define void @foo(i64 %img, i32 %val, i32 %idx) {
+define ptx_kernel void @foo(i64 %img, i32 %val, i32 %idx) {
 ; CHECK: sust.b.1d.b32.trap [foo_param_0, {%r{{[0-9]+}}}], {%r{{[0-9]+}}}
   tail call void @llvm.nvvm.sust.b.1d.i32.trap(i64 %img, i32 %idx, i32 %val)
   ret void
 }
 
-!nvvm.annotations = !{!1, !2}
-!1 = !{ptr @foo, !"kernel", i32 1}
-!2 = !{ptr @foo, !"wroimage", i32 0}
+!nvvm.annotations = !{!1}
+!1 = !{ptr @foo, !"wroimage", i32 0}
diff --git a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
index ba556d2d..61837bd 100644
--- a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
@@ -10,7 +10,7 @@ target triple = "nvptx-unknown-cuda"
 declare { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64, i32)
 declare i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1))
 
-define void @foo(i64 %img, ptr %red, i32 %idx) {
+define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) {
 ; CHECK-LABEL: foo(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
@@ -34,7 +34,7 @@ define void @foo(i64 %img, ptr %red, i32 %idx) {
 
 @tex0 = internal addrspace(1) global i64 0, align 8
 
-define void @bar(ptr %red, i32 %idx) {
+define ptx_kernel void @bar(ptr %red, i32 %idx) {
 ; CHECK-LABEL: bar(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
@@ -57,7 +57,7 @@ define void @bar(ptr %red, i32 %idx) {
 
 declare float @texfunc(i64)
 
-define void @baz(ptr %red, i32 %idx) {
+define ptx_kernel void @baz(ptr %red, i32 %idx) {
 ; CHECK-LABEL: baz(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
@@ -93,8 +93,5 @@ define void @baz(ptr %red, i32 %idx) {
   ret void
 }
 
-!nvvm.annotations = !{!1, !2, !3, !4}
-!1 = !{ptr @foo, !"kernel", i32 1}
-!2 = !{ptr @bar, !"kernel", i32 1}
-!3 = !{ptr addrspace(1) @tex0, !"texture", i32 1}
-!4 = !{ptr @baz, !"kernel", i32 1}
+!nvvm.annotations = !{!1}
+!1 = !{ptr addrspace(1) @tex0, !"texture", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/tex-read.ll b/llvm/test/CodeGen/NVPTX/tex-read.ll
index d11aea4..d74c89f5 100644
--- a/llvm/test/CodeGen/NVPTX/tex-read.ll
+++ b/llvm/test/CodeGen/NVPTX/tex-read.ll
@@ -6,7 +6,7 @@ target triple = "nvptx64-unknown-nvcl"
 declare { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64, i64, i32)
 
 ; CHECK: .entry foo
-define void @foo(i64 %img, i64 %sampler, ptr %red, i32 %idx) {
+define ptx_kernel void @foo(i64 %img, i64 %sampler, ptr %red, i32 %idx) {
 ; CHECK: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [foo_param_0, foo_param_1, {%r{{[0-9]+}}}]
   %val = tail call { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64 %img, i64 %sampler, i32 %idx)
   %ret = extractvalue { float, float, float, float } %val, 0
@@ -15,7 +15,6 @@ define void @foo(i64 %img, i64 %sampler, ptr %red, i32 %idx) {
   ret void
 }
 
-!nvvm.annotations = !{!1, !2, !3}
-!1 = !{ptr @foo, !"kernel", i32 1}
+!nvvm.annotations = !{!2, !3}
 !2 = !{ptr @foo, !"rdoimage", i32 0}
 !3 = !{ptr @foo, !"sampler", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/unreachable.ll b/llvm/test/CodeGen/NVPTX/unreachable.ll
index 286f358..80cf938 100644
--- a/llvm/test/CodeGen/NVPTX/unreachable.ll
+++ b/llvm/test/CodeGen/NVPTX/unreachable.ll
@@ -21,7 +21,7 @@ target triple = "nvptx-unknown-cuda"
 declare void @throw() #0
 declare void @llvm.trap() #0
 
-define void @kernel_func() {
+define ptx_kernel void @kernel_func() {
 ; NO-TRAP-UNREACHABLE-LABEL: kernel_func(
 ; NO-TRAP-UNREACHABLE:       {
 ; NO-TRAP-UNREACHABLE-EMPTY:
@@ -102,6 +102,3 @@ define void @kernel_func_2() {
 }
 
 attributes #0 = { noreturn }
-
-!nvvm.annotations = !{!1}
-!1 = !{ptr @kernel_func, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index cb54812..f7ed690 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -153,7 +153,7 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
 ; CHECK-PTX-NEXT:    .reg .b16 %rs<6>;
 ; CHECK-PTX-NEXT:    .reg .b32 %r<7>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<11>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    mov.u64 %SPL, __local_depot2;
@@ -163,24 +163,20 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    add.s64 %rd2, %rd1, 7;
 ; CHECK-PTX-NEXT:    and.b64 %rd3, %rd2, -8;
 ; CHECK-PTX-NEXT:    ld.u32 %r2, [%rd3];
-; CHECK-PTX-NEXT:    or.b64 %rd4, %rd3, 4;
-; CHECK-PTX-NEXT:    ld.s8 %r3, [%rd4];
-; CHECK-PTX-NEXT:    or.b64 %rd5, %rd3, 5;
-; CHECK-PTX-NEXT:    or.b64 %rd6, %rd3, 7;
-; CHECK-PTX-NEXT:    ld.u8 %rs1, [%rd6];
+; CHECK-PTX-NEXT:    ld.s8 %r3, [%rd3+4];
+; CHECK-PTX-NEXT:    ld.u8 %rs1, [%rd3+7];
 ; CHECK-PTX-NEXT:    st.u8 [%SP+2], %rs1;
-; CHECK-PTX-NEXT:    ld.u8 %rs2, [%rd5];
-; CHECK-PTX-NEXT:    or.b64 %rd7, %rd3, 6;
-; CHECK-PTX-NEXT:    ld.u8 %rs3, [%rd7];
+; CHECK-PTX-NEXT:    ld.u8 %rs2, [%rd3+5];
+; CHECK-PTX-NEXT:    ld.u8 %rs3, [%rd3+6];
 ; CHECK-PTX-NEXT:    shl.b16 %rs4, %rs3, 8;
 ; CHECK-PTX-NEXT:    or.b16 %rs5, %rs4, %rs2;
 ; CHECK-PTX-NEXT:    st.u16 [%SP], %rs5;
-; CHECK-PTX-NEXT:    ld.u64 %rd8, [%rd3+8];
+; CHECK-PTX-NEXT:    ld.u64 %rd4, [%rd3+8];
 ; CHECK-PTX-NEXT:    add.s32 %r4, %r1, %r2;
 ; CHECK-PTX-NEXT:    add.s32 %r5, %r4, %r3;
-; CHECK-PTX-NEXT:    cvt.u64.u32 %rd9, %r5;
-; CHECK-PTX-NEXT:    add.s64 %rd10, %rd9, %rd8;
-; CHECK-PTX-NEXT:    cvt.u32.u64 %r6, %rd10;
+; CHECK-PTX-NEXT:    cvt.u64.u32 %rd5, %r5;
+; CHECK-PTX-NEXT:    add.s64 %rd6, %rd5, %rd4;
+; CHECK-PTX-NEXT:    cvt.u32.u64 %r6, %rd6;
 ; CHECK-PTX-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-PTX-NEXT:    ret;
 entry:
@@ -219,7 +215,7 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
 ; CHECK-PTX-NEXT:    .reg .b16 %rs<10>;
 ; CHECK-PTX-NEXT:    .reg .b32 %r<4>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<8>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    mov.u64 %SPL, __local_depot3;
@@ -240,17 +236,16 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT:    st.u16 [%SP], %rs8;
 ; CHECK-PTX-NEXT:    mov.b32 %r1, 1;
 ; CHECK-PTX-NEXT:    st.u32 [%SP+8], %r1;
-; CHECK-PTX-NEXT:    add.u64 %rd5, %SP, 8;
-; CHECK-PTX-NEXT:    or.b64 %rd6, %rd5, 4;
 ; CHECK-PTX-NEXT:    mov.b16 %rs9, 1;
-; CHECK-PTX-NEXT:    st.u8 [%rd6], %rs9;
-; CHECK-PTX-NEXT:    mov.b64 %rd7, 1;
-; CHECK-PTX-NEXT:    st.u64 [%SP+16], %rd7;
+; CHECK-PTX-NEXT:    st.u8 [%SP+12], %rs9;
+; CHECK-PTX-NEXT:    mov.b64 %rd5, 1;
+; CHECK-PTX-NEXT:    st.u64 [%SP+16], %rd5;
+; CHECK-PTX-NEXT:    add.u64 %rd6, %SP, 8;
 ; CHECK-PTX-NEXT:    { // callseq 1, 0
 ; CHECK-PTX-NEXT:    .param .b32 param0;
 ; CHECK-PTX-NEXT:    st.param.b32 [param0], 1;
 ; CHECK-PTX-NEXT:    .param .b64 param1;
-; CHECK-PTX-NEXT:    st.param.b64 [param1], %rd5;
+; CHECK-PTX-NEXT:    st.param.b64 [param1], %rd6;
 ; CHECK-PTX-NEXT:    .param .b32 retval0;
 ; CHECK-PTX-NEXT:    call.uni (retval0),
 ; CHECK-PTX-NEXT:    variadics2,
diff --git a/llvm/test/CodeGen/PowerPC/global-merge-aix-zero-size-struct.ll b/llvm/test/CodeGen/PowerPC/global-merge-aix-zero-size-struct.ll
new file mode 100644
index 0000000..ec6fd7e
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/global-merge-aix-zero-size-struct.ll
@@ -0,0 +1,20 @@
+; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff -mcpu=pwr7 < %s | FileCheck %s
+
+; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff -mcpu=pwr7 --filetype=obj -o %t.o < %s
+; RUN: llvm-objdump --syms %t.o | FileCheck %s --check-prefix=OBJ
+
+%struct.anon = type {}
+
+@a = internal constant %struct.anon zeroinitializer, align 1
+@b = internal constant [6 x i8] c"hello\00", align 1
+
+; CHECK:      	.csect L.._MergedGlobals[RO],2
+; CHECK-NEXT: 	.lglobl	a                               # @_MergedGlobals
+; CHECK-NEXT: 	.lglobl	b
+; CHECK-NEXT: a:
+; CHECK-NEXT: b:
+; CHECK-NEXT: 	.string	"hello"
+
+; OBJ:      0000000000000000 l       .text	0000000000000006 L.._MergedGlobals
+; OBJ-NEXT: 0000000000000000 l       .text (csect: L.._MergedGlobals) 	0000000000000000 a
+; OBJ-NEXT: 0000000000000000 l       .text (csect: L.._MergedGlobals) 	0000000000000000 b
diff --git a/llvm/test/CodeGen/PowerPC/lsr-postinc-pos.ll b/llvm/test/CodeGen/PowerPC/lsr-postinc-pos.ll
index 83763f5..2c5ad94 100644
--- a/llvm/test/CodeGen/PowerPC/lsr-postinc-pos.ll
+++ b/llvm/test/CodeGen/PowerPC/lsr-postinc-pos.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -print-lsr-output 2>&1 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -stop-after=loop-reduce | FileCheck %s
 
 ; The icmp is a post-inc use, and the increment is in %bb11, but the
 ; scevgep needs to be inserted in %bb so that it is dominated by %t.
diff --git a/llvm/test/CodeGen/PowerPC/memcmp.ll b/llvm/test/CodeGen/PowerPC/memcmp.ll
index 0634534..39f9269 100644
--- a/llvm/test/CodeGen/PowerPC/memcmp.ll
+++ b/llvm/test/CodeGen/PowerPC/memcmp.ll
@@ -6,13 +6,12 @@ define signext i32 @memcmp8(ptr nocapture readonly %buffer1, ptr nocapture reado
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ldbrx 3, 0, 3
 ; CHECK-NEXT:    ldbrx 4, 0, 4
-; CHECK-NEXT:    subc 5, 4, 3
-; CHECK-NEXT:    subfe 5, 4, 4
-; CHECK-NEXT:    subc 4, 3, 4
-; CHECK-NEXT:    subfe 3, 3, 3
-; CHECK-NEXT:    neg 5, 5
+; CHECK-NEXT:    cmpld 3, 4
+; CHECK-NEXT:    subc 3, 4, 3
+; CHECK-NEXT:    subfe 3, 4, 4
+; CHECK-NEXT:    li 4, -1
 ; CHECK-NEXT:    neg 3, 3
-; CHECK-NEXT:    sub 3, 5, 3
+; CHECK-NEXT:    isellt 3, 4, 3
 ; CHECK-NEXT:    extsw 3, 3
 ; CHECK-NEXT:    blr
   %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 8)
@@ -24,12 +23,11 @@ define signext i32 @memcmp4(ptr nocapture readonly %buffer1, ptr nocapture reado
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    lwbrx 3, 0, 3
 ; CHECK-NEXT:    lwbrx 4, 0, 4
+; CHECK-NEXT:    cmplw 3, 4
 ; CHECK-NEXT:    sub 5, 4, 3
-; CHECK-NEXT:    sub 3, 3, 4
+; CHECK-NEXT:    li 3, -1
 ; CHECK-NEXT:    rldicl 5, 5, 1, 63
-; CHECK-NEXT:    rldicl 3, 3, 1, 63
-; CHECK-NEXT:    sub 3, 5, 3
-; CHECK-NEXT:    extsw 3, 3
+; CHECK-NEXT:    isellt 3, 3, 5
 ; CHECK-NEXT:    blr
   %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 4)
   ret i32 %call
diff --git a/llvm/test/CodeGen/PowerPC/memcmpIR.ll b/llvm/test/CodeGen/PowerPC/memcmpIR.ll
index 0a8bec7..b57d2b5 100644
--- a/llvm/test/CodeGen/PowerPC/memcmpIR.ll
+++ b/llvm/test/CodeGen/PowerPC/memcmpIR.ll
@@ -59,22 +59,14 @@ define signext i32 @test2(ptr nocapture readonly %buffer1, ptr nocapture readonl
   ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, ptr
   ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]])
   ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]])
-  ; CHECK-NEXT: [[CMP1:%[0-9]+]] = icmp ugt i32 [[BSWAP1]], [[BSWAP2]]
-  ; CHECK-NEXT: [[CMP2:%[0-9]+]] = icmp ult i32 [[BSWAP1]], [[BSWAP2]]
-  ; CHECK-NEXT: [[Z1:%[0-9]+]] = zext i1 [[CMP1]] to i32
-  ; CHECK-NEXT: [[Z2:%[0-9]+]] = zext i1 [[CMP2]] to i32
-  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[Z1]], [[Z2]]
-  ; CHECK-NEXT: ret i32 [[SUB]]
+  ; CHECK-NEXT: [[UCMP:%[0-9]+]] = call i32 @llvm.ucmp.i32.i32(i32 [[BSWAP1]], i32 [[BSWAP2]])
+  ; CHECK-NEXT: ret i32 [[UCMP]]
 
   ; CHECK-BE-LABEL: @test2(
   ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, ptr
   ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, ptr
-  ; CHECK-BE-NEXT: [[CMP1:%[0-9]+]] = icmp ugt i32 [[LOAD1]], [[LOAD2]]
-  ; CHECK-BE-NEXT: [[CMP2:%[0-9]+]] = icmp ult i32 [[LOAD1]], [[LOAD2]]
-  ; CHECK-BE-NEXT: [[Z1:%[0-9]+]] = zext i1 [[CMP1]] to i32
-  ; CHECK-BE-NEXT: [[Z2:%[0-9]+]] = zext i1 [[CMP2]] to i32
-  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[Z1]], [[Z2]]
-  ; CHECK-BE-NEXT: ret i32 [[SUB]]
+  ; CHECK-BE-NEXT: [[UCMP:%[0-9]+]] = call i32 @llvm.ucmp.i32.i32(i32 [[LOAD1]], i32 [[LOAD2]])
+  ; CHECK-BE-NEXT: ret i32 [[UCMP]]
 
 entry:
   %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 4)
diff --git a/llvm/test/CodeGen/PowerPC/ppc32-pic-large.ll b/llvm/test/CodeGen/PowerPC/ppc32-pic-large.ll
index 025a5ad..2f0b929 100644
--- a/llvm/test/CodeGen/PowerPC/ppc32-pic-large.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc32-pic-large.ll
@@ -13,6 +13,8 @@ $bar1 = comdat any
 @bar2 = global i32 0, align 4, comdat($bar1)
 
 declare i32 @call_foo(i32, ...)
+declare i32 @call_strictfp() strictfp
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
 
 define i32 @foo() {
 entry:
@@ -21,6 +23,23 @@ entry:
   ret i32 %0
 }
 
+define i32 @foo1() strictfp {
+entry:
+  %call = call i32 (i32, ...) @call_foo(i32 0)
+  ret i32 %call
+}
+
+define i32 @foo1_strictfp() strictfp {
+entry:
+  %call = call i32 () @call_strictfp()
+  ret i32 %call
+}
+
+define void @foo2(ptr %a) {
+  call void @llvm.memset.p0.i64(ptr align 1 %a, i8 1, i64 1000, i1 false)
+  ret void
+}
+
 define i32 @load() {
 entry:
   %0 = load i32, ptr @bar1
@@ -49,6 +68,31 @@ entry:
 ; LARGE-SECUREPLT:   addi 30, 30, .LTOC-.L0$pb@l
 ; LARGE-SECUREPLT:   bl call_foo@PLT+32768
 
+; LARGE-SECUREPLT-LABEL: foo1:
+; LARGE-SECUREPLT:       .L1$pb:
+; LARGE-SECUREPLT-NEXT:    crxor 6, 6, 6
+; LARGE-SECUREPLT-NEXT:    mflr 30
+; LARGE-SECUREPLT-NEXT:    addis 30, 30, .LTOC-.L1$pb@ha
+; LARGE-SECUREPLT-NEXT:    addi 30, 30, .LTOC-.L1$pb@l
+; LARGE-SECUREPLT-NEXT:    li 3, 0
+; LARGE-SECUREPLT-NEXT:    bl call_foo@PLT+32768
+
+; LARGE-SECUREPLT-LABEL: foo1_strictfp:
+; LARGE-SECUREPLT:       .L2$pb:
+; LARGE-SECUREPLT-NEXT:    mflr 30
+; LARGE-SECUREPLT-NEXT:    addis 30, 30, .LTOC-.L2$pb@ha
+; LARGE-SECUREPLT-NEXT:    addi 30, 30, .LTOC-.L2$pb@l
+; LARGE-SECUREPLT-NEXT:    bl call_strictfp@PLT+32768
+
+; LARGE-SECUREPLT-LABEL: foo2:
+; LARGE-SECUREPLT:       .L3$pb:
+; LARGE-SECUREPLT:         mflr 30
+; LARGE-SECUREPLT-NEXT:    addis 30, 30, .LTOC-.L3$pb@ha
+; LARGE-SECUREPLT-NEXT:    addi 30, 30, .LTOC-.L3$pb@l
+; LARGE-SECUREPLT:         bl memset@PLT+32768
+
+; LARGE-SECUREPLT-LABEEL: load:
+
 ; LARGE:      .section .bss.bar1,"awG",@nobits,bar1,comdat
 ; LARGE:      bar1:
 ; LARGE:      .section .bss.bar2,"awG",@nobits,bar1,comdat
diff --git a/llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll b/llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll
index b6799c8..f62f70c 100644
--- a/llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll
+++ b/llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll
@@ -11,24 +11,35 @@
 define i8 @scalar_to_vector_half(ptr nocapture readonly %ad) {
 ; P9LE-LABEL: scalar_to_vector_half:
 ; P9LE:       # %bb.0: # %entry
-; P9LE-NEXT:    lhz r3, 0(r3)
+; P9LE-NEXT:    lxsihzx v2, 0, r3
+; P9LE-NEXT:    li r3, 0
+; P9LE-NEXT:    vsplth v2, v2, 3
+; P9LE-NEXT:    vextubrx r3, r3, v2
 ; P9LE-NEXT:    blr
 ;
 ; P9BE-LABEL: scalar_to_vector_half:
 ; P9BE:       # %bb.0: # %entry
-; P9BE-NEXT:    lhz r3, 0(r3)
-; P9BE-NEXT:    srwi r3, r3, 24
+; P9BE-NEXT:    lxsihzx v2, 0, r3
+; P9BE-NEXT:    li r3, 0
+; P9BE-NEXT:    vsplth v2, v2, 3
+; P9BE-NEXT:    vextublx r3, r3, v2
 ; P9BE-NEXT:    blr
 ;
 ; P8LE-LABEL: scalar_to_vector_half:
 ; P8LE:       # %bb.0: # %entry
 ; P8LE-NEXT:    lhz r3, 0(r3)
+; P8LE-NEXT:    mtfprd f0, r3
+; P8LE-NEXT:    mffprd r3, f0
+; P8LE-NEXT:    clrldi r3, r3, 56
 ; P8LE-NEXT:    blr
 ;
 ; P8BE-LABEL: scalar_to_vector_half:
 ; P8BE:       # %bb.0: # %entry
 ; P8BE-NEXT:    lhz r3, 0(r3)
-; P8BE-NEXT:    srwi r3, r3, 24
+; P8BE-NEXT:    sldi r3, r3, 48
+; P8BE-NEXT:    mtfprd f0, r3
+; P8BE-NEXT:    mffprd r3, f0
+; P8BE-NEXT:    rldicl r3, r3, 8, 56
 ; P8BE-NEXT:    blr
 entry:
     %0 = load <2 x i8>, ptr %ad, align 1
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll b/llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll
index ff56ab1..0fd23a7 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll
@@ -14,7 +14,7 @@ define i32 @add_positive_low_bound_reject(i32 %a) nounwind {
 ;
 ; RV64I-LABEL: add_positive_low_bound_reject:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi a0, a0, 2047
+; RV64I-NEXT:    addiw a0, a0, 2047
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, 2047
   ret i32 %1
@@ -30,7 +30,7 @@ define i32 @add_positive_low_bound_accept(i32 %a) nounwind {
 ; RV64I-LABEL: add_positive_low_bound_accept:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a0, a0, 2047
-; RV64I-NEXT:    addi a0, a0, 1
+; RV64I-NEXT:    addiw a0, a0, 1
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, 2048
   ret i32 %1
@@ -46,7 +46,7 @@ define i32 @add_positive_high_bound_accept(i32 %a) nounwind {
 ; RV64I-LABEL: add_positive_high_bound_accept:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a0, a0, 2047
-; RV64I-NEXT:    addi a0, a0, 2047
+; RV64I-NEXT:    addiw a0, a0, 2047
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, 4094
   ret i32 %1
@@ -63,8 +63,8 @@ define i32 @add_positive_high_bound_reject(i32 %a) nounwind {
 ; RV64I-LABEL: add_positive_high_bound_reject:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a1, 1
-; RV64I-NEXT:    addiw a1, a1, -1
-; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    addi a1, a1, -1
+; RV64I-NEXT:    addw a0, a0, a1
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, 4095
   ret i32 %1
@@ -78,7 +78,7 @@ define i32 @add_negative_high_bound_reject(i32 %a) nounwind {
 ;
 ; RV64I-LABEL: add_negative_high_bound_reject:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi a0, a0, -2048
+; RV64I-NEXT:    addiw a0, a0, -2048
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, -2048
   ret i32 %1
@@ -94,7 +94,7 @@ define i32 @add_negative_high_bound_accept(i32 %a) nounwind {
 ; RV64I-LABEL: add_negative_high_bound_accept:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a0, a0, -2048
-; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    addiw a0, a0, -1
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, -2049
   ret i32 %1
@@ -110,7 +110,7 @@ define i32 @add_negative_low_bound_accept(i32 %a) nounwind {
 ; RV64I-LABEL: add_negative_low_bound_accept:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a0, a0, -2048
-; RV64I-NEXT:    addi a0, a0, -2048
+; RV64I-NEXT:    addiw a0, a0, -2048
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, -4096
   ret i32 %1
@@ -127,8 +127,8 @@ define i32 @add_negative_low_bound_reject(i32 %a) nounwind {
 ; RV64I-LABEL: add_negative_low_bound_reject:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a1, 1048575
-; RV64I-NEXT:    addiw a1, a1, -1
-; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    addi a1, a1, -1
+; RV64I-NEXT:    addw a0, a0, a1
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, -4097
   ret i32 %1
@@ -144,7 +144,7 @@ define i32 @add32_accept(i32 %a) nounwind {
 ; RV64I-LABEL: add32_accept:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a0, a0, 2047
-; RV64I-NEXT:    addi a0, a0, 952
+; RV64I-NEXT:    addiw a0, a0, 952
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, 2999
   ret i32 %1
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll b/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll
index ee41499..f1c0fcc 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll
@@ -37,7 +37,7 @@ define i32 @add_i8_signext_i32(i8 %a, i8 %b) {
 ; RV64IM-NEXT:    slli a1, a1, 56
 ; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    srai a1, a1, 56
-; RV64IM-NEXT:    add a0, a0, a1
+; RV64IM-NEXT:    addw a0, a0, a1
 ; RV64IM-NEXT:    ret
 entry:
   %0 = sext i8 %a to i32
@@ -58,7 +58,7 @@ define i32 @add_i8_zeroext_i32(i8 %a, i8 %b) {
 ; RV64IM:       # %bb.0: # %entry
 ; RV64IM-NEXT:    andi a0, a0, 255
 ; RV64IM-NEXT:    andi a1, a1, 255
-; RV64IM-NEXT:    add a0, a0, a1
+; RV64IM-NEXT:    addw a0, a0, a1
 ; RV64IM-NEXT:    ret
 entry:
   %0 = zext i8 %a to i32
@@ -78,7 +78,7 @@ define i32 @add_i32(i32 %a, i32 %b) {
 ;
 ; RV64IM-LABEL: add_i32:
 ; RV64IM:       # %bb.0: # %entry
-; RV64IM-NEXT:    add a0, a0, a1
+; RV64IM-NEXT:    addw a0, a0, a1
 ; RV64IM-NEXT:    ret
 entry:
   %0 = add i32 %a, %b
@@ -93,7 +93,7 @@ define i32 @addi_i32(i32 %a) {
 ;
 ; RV64IM-LABEL: addi_i32:
 ; RV64IM:       # %bb.0: # %entry
-; RV64IM-NEXT:    addi a0, a0, 1234
+; RV64IM-NEXT:    addiw a0, a0, 1234
 ; RV64IM-NEXT:    ret
 entry:
   %0 = add i32 %a, 1234
@@ -108,7 +108,7 @@ define i32 @sub_i32(i32 %a, i32 %b) {
 ;
 ; RV64IM-LABEL: sub_i32:
 ; RV64IM:       # %bb.0: # %entry
-; RV64IM-NEXT:    sub a0, a0, a1
+; RV64IM-NEXT:    subw a0, a0, a1
 ; RV64IM-NEXT:    ret
 entry:
   %0 = sub i32 %a, %b
@@ -123,7 +123,7 @@ define i32 @subi_i32(i32 %a) {
 ;
 ; RV64IM-LABEL: subi_i32:
 ; RV64IM:       # %bb.0: # %entry
-; RV64IM-NEXT:    addi a0, a0, -1234
+; RV64IM-NEXT:    addiw a0, a0, -1234
 ; RV64IM-NEXT:    ret
 entry:
   %0 = sub i32 %a, 1234
@@ -138,7 +138,7 @@ define i32 @neg_i32(i32 %a) {
 ;
 ; RV64IM-LABEL: neg_i32:
 ; RV64IM:       # %bb.0: # %entry
-; RV64IM-NEXT:    neg a0, a0
+; RV64IM-NEXT:    negw a0, a0
 ; RV64IM-NEXT:    ret
 entry:
   %0 = sub i32 0, %a
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll b/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll
new file mode 100644
index 0000000..3a55189
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll
@@ -0,0 +1,453 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -global-isel -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=RV32I
+; RUN: llc -mtriple=riscv32 -global-isel -mattr=+zbb -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=RV32ZBB
+; RUN: llc -mtriple=riscv64 -global-isel -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=RV64I
+; RUN: llc -mtriple=riscv64 -global-isel -mattr=+zbb -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=RV64ZBB
+
+define i32 @expanded_neg_abs32(i32 %x) {
+; RV32I-LABEL: expanded_neg_abs32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    blt a0, a1, .LBB0_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:  .LBB0_2:
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_abs32:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    min a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_abs32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    negw a1, a0
+; RV64I-NEXT:    sext.w a2, a0
+; RV64I-NEXT:    blt a2, a1, .LBB0_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB0_2:
+; RV64I-NEXT:    negw a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_abs32:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    negw a1, a0
+; RV64ZBB-NEXT:    sext.w a0, a0
+; RV64ZBB-NEXT:    max a0, a1, a0
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    ret
+  %n = sub i32 0, %x
+  %t = call i32 @llvm.smax.i32(i32 %n, i32 %x)
+  %r = sub i32 0, %t
+  ret i32 %r
+}
+
+define i32 @expanded_neg_abs32_unsigned(i32 %x) {
+; RV32I-LABEL: expanded_neg_abs32_unsigned:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    bltu a0, a1, .LBB1_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:  .LBB1_2:
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_abs32_unsigned:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    minu a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_abs32_unsigned:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    negw a1, a0
+; RV64I-NEXT:    sext.w a2, a0
+; RV64I-NEXT:    bltu a2, a1, .LBB1_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB1_2:
+; RV64I-NEXT:    negw a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_abs32_unsigned:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    negw a1, a0
+; RV64ZBB-NEXT:    sext.w a0, a0
+; RV64ZBB-NEXT:    maxu a0, a1, a0
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    ret
+  %n = sub i32 0, %x
+  %t = call i32 @llvm.umax.i32(i32 %n, i32 %x)
+  %r = sub i32 0, %t
+  ret i32 %r
+}
+
+define i64 @expanded_neg_abs64(i64 %x) {
+; RV32I-LABEL: expanded_neg_abs64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    snez a2, a0
+; RV32I-NEXT:    neg a3, a1
+; RV32I-NEXT:    sub a2, a3, a2
+; RV32I-NEXT:    neg a3, a0
+; RV32I-NEXT:    beq a2, a1, .LBB2_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slt a4, a1, a2
+; RV32I-NEXT:    beqz a4, .LBB2_3
+; RV32I-NEXT:    j .LBB2_4
+; RV32I-NEXT:  .LBB2_2:
+; RV32I-NEXT:    sltu a4, a0, a3
+; RV32I-NEXT:    bnez a4, .LBB2_4
+; RV32I-NEXT:  .LBB2_3:
+; RV32I-NEXT:    mv a3, a0
+; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:  .LBB2_4:
+; RV32I-NEXT:    neg a0, a3
+; RV32I-NEXT:    snez a1, a3
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_abs64:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    snez a2, a0
+; RV32ZBB-NEXT:    neg a3, a1
+; RV32ZBB-NEXT:    sub a2, a3, a2
+; RV32ZBB-NEXT:    neg a3, a0
+; RV32ZBB-NEXT:    beq a2, a1, .LBB2_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    slt a4, a1, a2
+; RV32ZBB-NEXT:    beqz a4, .LBB2_3
+; RV32ZBB-NEXT:    j .LBB2_4
+; RV32ZBB-NEXT:  .LBB2_2:
+; RV32ZBB-NEXT:    sltu a4, a0, a3
+; RV32ZBB-NEXT:    bnez a4, .LBB2_4
+; RV32ZBB-NEXT:  .LBB2_3:
+; RV32ZBB-NEXT:    mv a3, a0
+; RV32ZBB-NEXT:    mv a2, a1
+; RV32ZBB-NEXT:  .LBB2_4:
+; RV32ZBB-NEXT:    neg a0, a3
+; RV32ZBB-NEXT:    snez a1, a3
+; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    sub a1, a2, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_abs64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    blt a0, a1, .LBB2_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB2_2:
+; RV64I-NEXT:    neg a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_abs64:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    neg a1, a0
+; RV64ZBB-NEXT:    min a0, a0, a1
+; RV64ZBB-NEXT:    ret
+  %n = sub i64 0, %x
+  %t = call i64 @llvm.smax.i64(i64 %n, i64 %x)
+  %r = sub i64 0, %t
+  ret i64 %r
+}
+
+define i64 @expanded_neg_abs64_unsigned(i64 %x) {
+; RV32I-LABEL: expanded_neg_abs64_unsigned:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    snez a2, a0
+; RV32I-NEXT:    neg a3, a1
+; RV32I-NEXT:    sub a2, a3, a2
+; RV32I-NEXT:    neg a3, a0
+; RV32I-NEXT:    beq a2, a1, .LBB3_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sltu a4, a1, a2
+; RV32I-NEXT:    beqz a4, .LBB3_3
+; RV32I-NEXT:    j .LBB3_4
+; RV32I-NEXT:  .LBB3_2:
+; RV32I-NEXT:    sltu a4, a0, a3
+; RV32I-NEXT:    bnez a4, .LBB3_4
+; RV32I-NEXT:  .LBB3_3:
+; RV32I-NEXT:    mv a3, a0
+; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:  .LBB3_4:
+; RV32I-NEXT:    neg a0, a3
+; RV32I-NEXT:    snez a1, a3
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_abs64_unsigned:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    snez a2, a0
+; RV32ZBB-NEXT:    neg a3, a1
+; RV32ZBB-NEXT:    sub a2, a3, a2
+; RV32ZBB-NEXT:    neg a3, a0
+; RV32ZBB-NEXT:    beq a2, a1, .LBB3_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    sltu a4, a1, a2
+; RV32ZBB-NEXT:    beqz a4, .LBB3_3
+; RV32ZBB-NEXT:    j .LBB3_4
+; RV32ZBB-NEXT:  .LBB3_2:
+; RV32ZBB-NEXT:    sltu a4, a0, a3
+; RV32ZBB-NEXT:    bnez a4, .LBB3_4
+; RV32ZBB-NEXT:  .LBB3_3:
+; RV32ZBB-NEXT:    mv a3, a0
+; RV32ZBB-NEXT:    mv a2, a1
+; RV32ZBB-NEXT:  .LBB3_4:
+; RV32ZBB-NEXT:    neg a0, a3
+; RV32ZBB-NEXT:    snez a1, a3
+; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    sub a1, a2, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_abs64_unsigned:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    bltu a0, a1, .LBB3_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB3_2:
+; RV64I-NEXT:    neg a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_abs64_unsigned:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    neg a1, a0
+; RV64ZBB-NEXT:    minu a0, a0, a1
+; RV64ZBB-NEXT:    ret
+  %n = sub i64 0, %x
+  %t = call i64 @llvm.umax.i64(i64 %n, i64 %x)
+  %r = sub i64 0, %t
+  ret i64 %r
+}
+
+define i32 @expanded_neg_inv_abs32(i32 %x) {
+; RV32I-LABEL: expanded_neg_inv_abs32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    blt a1, a0, .LBB4_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:  .LBB4_2:
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_inv_abs32:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    max a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_inv_abs32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    negw a1, a0
+; RV64I-NEXT:    sext.w a2, a0
+; RV64I-NEXT:    blt a1, a2, .LBB4_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB4_2:
+; RV64I-NEXT:    negw a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_inv_abs32:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    negw a1, a0
+; RV64ZBB-NEXT:    sext.w a0, a0
+; RV64ZBB-NEXT:    min a0, a1, a0
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    ret
+  %n = sub i32 0, %x
+  %t = call i32 @llvm.smin.i32(i32 %n, i32 %x)
+  %r = sub i32 0, %t
+  ret i32 %r
+}
+
+define i32 @expanded_neg_inv_abs32_unsigned(i32 %x) {
+; RV32I-LABEL: expanded_neg_inv_abs32_unsigned:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    bltu a1, a0, .LBB5_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:  .LBB5_2:
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_inv_abs32_unsigned:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    maxu a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_inv_abs32_unsigned:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    negw a1, a0
+; RV64I-NEXT:    sext.w a2, a0
+; RV64I-NEXT:    bltu a1, a2, .LBB5_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB5_2:
+; RV64I-NEXT:    negw a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_inv_abs32_unsigned:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    negw a1, a0
+; RV64ZBB-NEXT:    sext.w a0, a0
+; RV64ZBB-NEXT:    minu a0, a1, a0
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    ret
+  %n = sub i32 0, %x
+  %t = call i32 @llvm.umin.i32(i32 %n, i32 %x)
+  %r = sub i32 0, %t
+  ret i32 %r
+}
+
+define i64 @expanded_neg_inv_abs64(i64 %x) {
+; RV32I-LABEL: expanded_neg_inv_abs64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    snez a2, a0
+; RV32I-NEXT:    neg a3, a1
+; RV32I-NEXT:    sub a2, a3, a2
+; RV32I-NEXT:    neg a3, a0
+; RV32I-NEXT:    beq a2, a1, .LBB6_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slt a4, a2, a1
+; RV32I-NEXT:    beqz a4, .LBB6_3
+; RV32I-NEXT:    j .LBB6_4
+; RV32I-NEXT:  .LBB6_2:
+; RV32I-NEXT:    sltu a4, a3, a0
+; RV32I-NEXT:    bnez a4, .LBB6_4
+; RV32I-NEXT:  .LBB6_3:
+; RV32I-NEXT:    mv a3, a0
+; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:  .LBB6_4:
+; RV32I-NEXT:    neg a0, a3
+; RV32I-NEXT:    snez a1, a3
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_inv_abs64:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    snez a2, a0
+; RV32ZBB-NEXT:    neg a3, a1
+; RV32ZBB-NEXT:    sub a2, a3, a2
+; RV32ZBB-NEXT:    neg a3, a0
+; RV32ZBB-NEXT:    beq a2, a1, .LBB6_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    slt a4, a2, a1
+; RV32ZBB-NEXT:    beqz a4, .LBB6_3
+; RV32ZBB-NEXT:    j .LBB6_4
+; RV32ZBB-NEXT:  .LBB6_2:
+; RV32ZBB-NEXT:    sltu a4, a3, a0
+; RV32ZBB-NEXT:    bnez a4, .LBB6_4
+; RV32ZBB-NEXT:  .LBB6_3:
+; RV32ZBB-NEXT:    mv a3, a0
+; RV32ZBB-NEXT:    mv a2, a1
+; RV32ZBB-NEXT:  .LBB6_4:
+; RV32ZBB-NEXT:    neg a0, a3
+; RV32ZBB-NEXT:    snez a1, a3
+; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    sub a1, a2, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_inv_abs64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    blt a1, a0, .LBB6_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB6_2:
+; RV64I-NEXT:    neg a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_inv_abs64:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    neg a1, a0
+; RV64ZBB-NEXT:    max a0, a0, a1
+; RV64ZBB-NEXT:    ret
+  %n = sub i64 0, %x
+  %t = call i64 @llvm.smin.i64(i64 %n, i64 %x)
+  %r = sub i64 0, %t
+  ret i64 %r
+}
+
+define i64 @expanded_neg_inv_abs64_unsigned(i64 %x) {
+; RV32I-LABEL: expanded_neg_inv_abs64_unsigned:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    snez a2, a0
+; RV32I-NEXT:    neg a3, a1
+; RV32I-NEXT:    sub a2, a3, a2
+; RV32I-NEXT:    neg a3, a0
+; RV32I-NEXT:    beq a2, a1, .LBB7_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sltu a4, a2, a1
+; RV32I-NEXT:    beqz a4, .LBB7_3
+; RV32I-NEXT:    j .LBB7_4
+; RV32I-NEXT:  .LBB7_2:
+; RV32I-NEXT:    sltu a4, a3, a0
+; RV32I-NEXT:    bnez a4, .LBB7_4
+; RV32I-NEXT:  .LBB7_3:
+; RV32I-NEXT:    mv a3, a0
+; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:  .LBB7_4:
+; RV32I-NEXT:    neg a0, a3
+; RV32I-NEXT:    snez a1, a3
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_inv_abs64_unsigned:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    snez a2, a0
+; RV32ZBB-NEXT:    neg a3, a1
+; RV32ZBB-NEXT:    sub a2, a3, a2
+; RV32ZBB-NEXT:    neg a3, a0
+; RV32ZBB-NEXT:    beq a2, a1, .LBB7_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    sltu a4, a2, a1
+; RV32ZBB-NEXT:    beqz a4, .LBB7_3
+; RV32ZBB-NEXT:    j .LBB7_4
+; RV32ZBB-NEXT:  .LBB7_2:
+; RV32ZBB-NEXT:    sltu a4, a3, a0
+; RV32ZBB-NEXT:    bnez a4, .LBB7_4
+; RV32ZBB-NEXT:  .LBB7_3:
+; RV32ZBB-NEXT:    mv a3, a0
+; RV32ZBB-NEXT:    mv a2, a1
+; RV32ZBB-NEXT:  .LBB7_4:
+; RV32ZBB-NEXT:    neg a0, a3
+; RV32ZBB-NEXT:    snez a1, a3
+; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    sub a1, a2, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_inv_abs64_unsigned:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    bltu a1, a0, .LBB7_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB7_2:
+; RV64I-NEXT:    neg a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_inv_abs64_unsigned:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    neg a1, a0
+; RV64ZBB-NEXT:    maxu a0, a0, a1
+; RV64ZBB-NEXT:    ret
+  %n = sub i64 0, %x
+  %t = call i64 @llvm.umin.i64(i64 %n, i64 %x)
+  %r = sub i64 0, %t
+  ret i64 %r
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll b/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll
index 9c7fd68..360e84d 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll
@@ -21,6 +21,7 @@ define i32 @constant_to_rhs(i32 %x) {
 ; RV64-O0-NEXT:    mv a1, a0
 ; RV64-O0-NEXT:    li a0, 1
 ; RV64-O0-NEXT:    add a0, a0, a1
+; RV64-O0-NEXT:    sext.w a0, a0
 ; RV64-O0-NEXT:    ret
 ;
 ; RV32-OPT-LABEL: constant_to_rhs:
@@ -30,7 +31,7 @@ define i32 @constant_to_rhs(i32 %x) {
 ;
 ; RV64-OPT-LABEL: constant_to_rhs:
 ; RV64-OPT:       # %bb.0:
-; RV64-OPT-NEXT:    addi a0, a0, 1
+; RV64-OPT-NEXT:    addiw a0, a0, 1
 ; RV64-OPT-NEXT:    ret
   %a = add i32 1, %x
   ret i32 %a
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/double-zfa.ll b/llvm/test/CodeGen/RISCV/GlobalISel/double-zfa.ll
index 385156b..4878699 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/double-zfa.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/double-zfa.ll
@@ -1,9 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-
 ; RUN: llc -mtriple=riscv32 -mattr=+zfa,d -global-isel < %s \
-; RUN: | FileCheck %s
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV32IDZFA
 ; RUN: llc -mtriple=riscv64 -mattr=+zfa,d -global-isel < %s \
-; RUN: | FileCheck %s
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV64DZFA
 
 
 define double @fceil(double %a) {
@@ -86,3 +85,32 @@ define double @fminimum(double %a, double %b) {
   %c = call double @llvm.minimum.f64(double %a, double %b)
   ret double %c
 }
+
+define i64 @fmvh_x_d(double %fa) {
+; RV32IDZFA-LABEL: fmvh_x_d:
+; RV32IDZFA:       # %bb.0:
+; RV32IDZFA-NEXT:    fmv.x.w a0, fa0
+; RV32IDZFA-NEXT:    fmvh.x.d a1, fa0
+; RV32IDZFA-NEXT:    ret
+;
+; RV64DZFA-LABEL: fmvh_x_d:
+; RV64DZFA:       # %bb.0:
+; RV64DZFA-NEXT:    fmv.x.d a0, fa0
+; RV64DZFA-NEXT:    ret
+  %i = bitcast double %fa to i64
+  ret i64 %i
+}
+
+define double @fmvp_d_x(i64 %a) {
+; RV32IDZFA-LABEL: fmvp_d_x:
+; RV32IDZFA:       # %bb.0:
+; RV32IDZFA-NEXT:    fmvp.d.x fa0, a0, a1
+; RV32IDZFA-NEXT:    ret
+;
+; RV64DZFA-LABEL: fmvp_d_x:
+; RV64DZFA:       # %bb.0:
+; RV64DZFA-NEXT:    fmv.d.x fa0, a0
+; RV64DZFA-NEXT:    ret
+  %or = bitcast i64 %a to double
+  ret double %or
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll b/llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll
index 72f0ab1..234f338 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll
@@ -96,12 +96,19 @@ define ptr @freeze_ptr(ptr %x) {
 %struct.T = type { i32, i32 }
 
 define i32 @freeze_struct(ptr %p) {
-; CHECK-LABEL: freeze_struct:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lw a1, 0(a0)
-; CHECK-NEXT:    lw a0, 4(a0)
-; CHECK-NEXT:    add a0, a1, a0
-; CHECK-NEXT:    ret
+; RV32-LABEL: freeze_struct:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lw a0, 4(a0)
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: freeze_struct:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lw a1, 0(a0)
+; RV64-NEXT:    lw a0, 4(a0)
+; RV64-NEXT:    addw a0, a1, a0
+; RV64-NEXT:    ret
   %s = load %struct.T, ptr %p
   %y1 = freeze %struct.T %s
   %v1 = extractvalue %struct.T %y1, 0
@@ -111,12 +118,19 @@ define i32 @freeze_struct(ptr %p) {
 }
 
 define i32 @freeze_anonstruct(ptr %p) {
-; CHECK-LABEL: freeze_anonstruct:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lw a1, 0(a0)
-; CHECK-NEXT:    lw a0, 4(a0)
-; CHECK-NEXT:    add a0, a1, a0
-; CHECK-NEXT:    ret
+; RV32-LABEL: freeze_anonstruct:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lw a0, 4(a0)
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: freeze_anonstruct:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lw a1, 0(a0)
+; RV64-NEXT:    lw a0, 4(a0)
+; RV64-NEXT:    addw a0, a1, a0
+; RV64-NEXT:    ret
   %s = load {i32, i32}, ptr %p
   %y1 = freeze {i32, i32} %s
   %v1 = extractvalue {i32, i32} %y1, 0
@@ -141,7 +155,7 @@ define i32 @freeze_anonstruct2(ptr %p) {
 ; RV64-NEXT:    lw a0, 0(a0)
 ; RV64-NEXT:    slli a1, a1, 48
 ; RV64-NEXT:    srli a1, a1, 48
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addw a0, a0, a1
 ; RV64-NEXT:    ret
   %s = load {i32, i16}, ptr %p
   %y1 = freeze {i32, i16} %s
@@ -168,7 +182,7 @@ define i32 @freeze_anonstruct2_sext(ptr %p) {
 ; RV64-NEXT:    lw a0, 0(a0)
 ; RV64-NEXT:    slli a1, a1, 48
 ; RV64-NEXT:    srai a1, a1, 48
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addw a0, a0, a1
 ; RV64-NEXT:    ret
   %s = load {i32, i16}, ptr %p
   %y1 = freeze {i32, i16} %s
@@ -180,12 +194,19 @@ define i32 @freeze_anonstruct2_sext(ptr %p) {
 }
 
 define i32 @freeze_array(ptr %p) nounwind {
-; CHECK-LABEL: freeze_array:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lw a1, 0(a0)
-; CHECK-NEXT:    lw a0, 4(a0)
-; CHECK-NEXT:    add a0, a1, a0
-; CHECK-NEXT:    ret
+; RV32-LABEL: freeze_array:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lw a0, 4(a0)
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: freeze_array:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lw a1, 0(a0)
+; RV64-NEXT:    lw a0, 4(a0)
+; RV64-NEXT:    addw a0, a1, a0
+; RV64-NEXT:    ret
   %s = load [2 x i32], ptr %p
   %y1 = freeze [2 x i32] %s
   %v1 = extractvalue [2 x i32] %y1, 0
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll b/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll
index 1156edf..31a78d4 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll
@@ -98,7 +98,7 @@ define i32 @abs32(i32 %x) {
 ; RV64I-LABEL: abs32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sraiw a1, a0, 31
-; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    addw a0, a0, a1
 ; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
index a27e2b8..dbc1384 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
@@ -23,7 +23,7 @@
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
 # DEBUG-NEXT: G_SUB (opcode [[SUB_OPC:[0-9]+]]): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. opcode [[SUB_OPC]] is aliased to [[ADD_OPC]]
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
@@ -59,7 +59,6 @@
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
 # DEBUG-NEXT: G_AND (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir
index 22ce8a0..78a2227b 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir
@@ -86,9 +86,10 @@ body:             |
     ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 31
     ; RV64I-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[ASSERT_SEXT]], [[C]](s64)
     ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[ASSERT_SEXT]], [[ASHR]]
-    ; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[ADD]], [[ASHR]]
-    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[XOR]], 32
-    ; RV64I-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[SEXT_INREG]], [[ASHR]]
+    ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[XOR]], 32
+    ; RV64I-NEXT: $x10 = COPY [[SEXT_INREG1]](s64)
     ; RV64I-NEXT: PseudoRET implicit $x10
     ;
     ; RV64ZBB-LABEL: name: abs_i32
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-add-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-add-rv64.mir
index 48b65a1..8f2b9f3 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-add-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-add-rv64.mir
@@ -69,7 +69,8 @@ body:             |
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: $x10 = COPY [[ADD]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; CHECK-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv64.mir
index f2ec709..eed1aac 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv64.mir
@@ -339,7 +339,7 @@ body:             |
     ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
     ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
     ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[SEXT_INREG]](s64), [[SEXT_INREG1]]
-    ; CHECK-NEXT: $x10 = COPY [[ADD]](s64)
+    ; CHECK-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; CHECK-NEXT: $x11 = COPY [[ICMP]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11
     %2:_(s64) = COPY $x10
@@ -454,10 +454,11 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
     ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 32
-    ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
-    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[SEXT_INREG]](s64), [[SEXT_INREG1]]
-    ; CHECK-NEXT: $x10 = COPY [[SUB]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 32
+    ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[SEXT_INREG1]](s64), [[SEXT_INREG2]]
+    ; CHECK-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; CHECK-NEXT: $x11 = COPY [[ICMP]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11
     %2:_(s64) = COPY $x10
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir
index 57fc513..e28572d 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir
@@ -145,7 +145,8 @@ body:             |
     ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -64769
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[C]]
-    ; CHECK-NEXT: $x10 = COPY [[ADD]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; CHECK-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10
     %0:_(s32) = G_CONSTANT i32 -64769
     %1:_(s64) = COPY $x10
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv64.mir
index 6cc5477..62d7313 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv64.mir
@@ -59,7 +59,8 @@ body:             |
     ; RV64ZBB-NEXT: [[CLZW:%[0-9]+]]:_(s64) = G_CLZW [[AND]]
     ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[CLZW]], [[C1]]
-    ; RV64ZBB-NEXT: $x10 = COPY [[SUB]](s64)
+    ; RV64ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; RV64ZBB-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; RV64ZBB-NEXT: PseudoRET implicit $x10
     %1:_(s64) = COPY $x10
     %0:_(s8) = G_TRUNC %1(s64)
@@ -129,7 +130,8 @@ body:             |
     ; RV64ZBB-NEXT: [[CLZW:%[0-9]+]]:_(s64) = G_CLZW [[AND]]
     ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[CLZW]], [[C1]]
-    ; RV64ZBB-NEXT: $x10 = COPY [[SUB]](s64)
+    ; RV64ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; RV64ZBB-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; RV64ZBB-NEXT: PseudoRET implicit $x10
     %1:_(s64) = COPY $x10
     %0:_(s16) = G_TRUNC %1(s64)
@@ -175,16 +177,19 @@ body:             |
     ; RV64I-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1431655765
     ; RV64I-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[LSHR5]], [[C6]]
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[OR4]], [[AND6]]
-    ; RV64I-NEXT: [[AND7:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C1]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; RV64I-NEXT: [[AND7:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C1]]
     ; RV64I-NEXT: [[LSHR6:%[0-9]+]]:_(s64) = G_LSHR [[AND7]], [[C2]](s64)
     ; RV64I-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 858993459
     ; RV64I-NEXT: [[AND8:%[0-9]+]]:_(s64) = G_AND [[LSHR6]], [[C7]]
-    ; RV64I-NEXT: [[AND9:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C7]]
+    ; RV64I-NEXT: [[AND9:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C7]]
     ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[AND8]], [[AND9]]
-    ; RV64I-NEXT: [[LSHR7:%[0-9]+]]:_(s64) = G_LSHR [[ADD]], [[C3]](s64)
-    ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[LSHR7]], [[ADD]]
+    ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; RV64I-NEXT: [[LSHR7:%[0-9]+]]:_(s64) = G_LSHR [[SEXT_INREG1]], [[C3]](s64)
+    ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[LSHR7]], [[SEXT_INREG1]]
+    ; RV64I-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD1]], 32
     ; RV64I-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 252645135
-    ; RV64I-NEXT: [[AND10:%[0-9]+]]:_(s64) = G_AND [[ADD1]], [[C8]]
+    ; RV64I-NEXT: [[AND10:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG2]], [[C8]]
     ; RV64I-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 16843009
     ; RV64I-NEXT: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND10]], [[C9]]
@@ -192,7 +197,8 @@ body:             |
     ; RV64I-NEXT: [[LSHR8:%[0-9]+]]:_(s64) = G_LSHR [[AND11]], [[C10]](s64)
     ; RV64I-NEXT: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
     ; RV64I-NEXT: [[SUB1:%[0-9]+]]:_(s64) = G_SUB [[C11]], [[LSHR8]]
-    ; RV64I-NEXT: $x10 = COPY [[SUB1]](s64)
+    ; RV64I-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB1]], 32
+    ; RV64I-NEXT: $x10 = COPY [[SEXT_INREG3]](s64)
     ; RV64I-NEXT: PseudoRET implicit $x10
     ;
     ; RV64ZBB-LABEL: name: ctlz_i32
@@ -328,7 +334,8 @@ body:             |
     ; RV64ZBB-NEXT: [[CLZW:%[0-9]+]]:_(s64) = G_CLZW [[AND]]
     ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[CLZW]], [[C1]]
-    ; RV64ZBB-NEXT: $x10 = COPY [[SUB]](s64)
+    ; RV64ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; RV64ZBB-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; RV64ZBB-NEXT: PseudoRET implicit $x10
     %1:_(s64) = COPY $x10
     %0:_(s8) = G_TRUNC %1(s64)
@@ -398,7 +405,8 @@ body:             |
     ; RV64ZBB-NEXT: [[CLZW:%[0-9]+]]:_(s64) = G_CLZW [[AND]]
     ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[CLZW]], [[C1]]
-    ; RV64ZBB-NEXT: $x10 = COPY [[SUB]](s64)
+    ; RV64ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; RV64ZBB-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; RV64ZBB-NEXT: PseudoRET implicit $x10
     %1:_(s64) = COPY $x10
     %0:_(s16) = G_TRUNC %1(s64)
@@ -444,16 +452,19 @@ body:             |
     ; RV64I-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1431655765
     ; RV64I-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[LSHR5]], [[C6]]
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[OR4]], [[AND6]]
-    ; RV64I-NEXT: [[AND7:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C1]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; RV64I-NEXT: [[AND7:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C1]]
     ; RV64I-NEXT: [[LSHR6:%[0-9]+]]:_(s64) = G_LSHR [[AND7]], [[C2]](s64)
     ; RV64I-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 858993459
     ; RV64I-NEXT: [[AND8:%[0-9]+]]:_(s64) = G_AND [[LSHR6]], [[C7]]
-    ; RV64I-NEXT: [[AND9:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C7]]
+    ; RV64I-NEXT: [[AND9:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C7]]
     ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[AND8]], [[AND9]]
-    ; RV64I-NEXT: [[LSHR7:%[0-9]+]]:_(s64) = G_LSHR [[ADD]], [[C3]](s64)
-    ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[LSHR7]], [[ADD]]
+    ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; RV64I-NEXT: [[LSHR7:%[0-9]+]]:_(s64) = G_LSHR [[SEXT_INREG1]], [[C3]](s64)
+    ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[LSHR7]], [[SEXT_INREG1]]
+    ; RV64I-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD1]], 32
     ; RV64I-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 252645135
-    ; RV64I-NEXT: [[AND10:%[0-9]+]]:_(s64) = G_AND [[ADD1]], [[C8]]
+    ; RV64I-NEXT: [[AND10:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG2]], [[C8]]
     ; RV64I-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 16843009
     ; RV64I-NEXT: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND10]], [[C9]]
@@ -461,7 +472,8 @@ body:             |
     ; RV64I-NEXT: [[LSHR8:%[0-9]+]]:_(s64) = G_LSHR [[AND11]], [[C10]](s64)
     ; RV64I-NEXT: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
     ; RV64I-NEXT: [[SUB1:%[0-9]+]]:_(s64) = G_SUB [[C11]], [[LSHR8]]
-    ; RV64I-NEXT: $x10 = COPY [[SUB1]](s64)
+    ; RV64I-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB1]], 32
+    ; RV64I-NEXT: $x10 = COPY [[SEXT_INREG3]](s64)
     ; RV64I-NEXT: PseudoRET implicit $x10
     ;
     ; RV64ZBB-LABEL: name: ctlz_zero_undef_i32
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctpop-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctpop-rv64.mir
index 1493514..c61c46d 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctpop-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctpop-rv64.mir
@@ -129,18 +129,21 @@ body:             |
     ; RV64I-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 1431655765
     ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[LSHR]], [[C2]]
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[AND1]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
     ; RV64I-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; RV64I-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C1]]
+    ; RV64I-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C1]]
     ; RV64I-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[AND2]], [[C3]](s64)
     ; RV64I-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 858993459
     ; RV64I-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[LSHR1]], [[C4]]
-    ; RV64I-NEXT: [[AND4:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C4]]
+    ; RV64I-NEXT: [[AND4:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C4]]
     ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[AND3]], [[AND4]]
+    ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
     ; RV64I-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; RV64I-NEXT: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[ADD]], [[C5]](s64)
-    ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[LSHR2]], [[ADD]]
+    ; RV64I-NEXT: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[SEXT_INREG1]], [[C5]](s64)
+    ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[LSHR2]], [[SEXT_INREG1]]
+    ; RV64I-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD1]], 32
     ; RV64I-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 252645135
-    ; RV64I-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[ADD1]], [[C6]]
+    ; RV64I-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG2]], [[C6]]
     ; RV64I-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16843009
     ; RV64I-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND5]], [[C7]]
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv64.mir
index 252e792..87155bb 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv64.mir
@@ -131,7 +131,8 @@ body:             |
     ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
     ; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY]], [[C]]
     ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[C]]
-    ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[ADD]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[SEXT_INREG]]
     ; RV64I-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; RV64I-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
     ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[AND]], [[C2]]
@@ -139,18 +140,21 @@ body:             |
     ; RV64I-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1431655765
     ; RV64I-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[LSHR]], [[C3]]
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[AND]], [[AND2]]
+    ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
     ; RV64I-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; RV64I-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C2]]
+    ; RV64I-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG1]], [[C2]]
     ; RV64I-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[AND3]], [[C4]](s64)
     ; RV64I-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 858993459
     ; RV64I-NEXT: [[AND4:%[0-9]+]]:_(s64) = G_AND [[LSHR1]], [[C5]]
-    ; RV64I-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C5]]
+    ; RV64I-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG1]], [[C5]]
     ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[AND4]], [[AND5]]
+    ; RV64I-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD1]], 32
     ; RV64I-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; RV64I-NEXT: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[ADD1]], [[C6]](s64)
-    ; RV64I-NEXT: [[ADD2:%[0-9]+]]:_(s64) = G_ADD [[LSHR2]], [[ADD1]]
+    ; RV64I-NEXT: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[SEXT_INREG2]], [[C6]](s64)
+    ; RV64I-NEXT: [[ADD2:%[0-9]+]]:_(s64) = G_ADD [[LSHR2]], [[SEXT_INREG2]]
+    ; RV64I-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD2]], 32
     ; RV64I-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 252645135
-    ; RV64I-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[ADD2]], [[C7]]
+    ; RV64I-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG3]], [[C7]]
     ; RV64I-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 16843009
     ; RV64I-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND6]], [[C8]]
@@ -351,7 +355,8 @@ body:             |
     ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
     ; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY]], [[C]]
     ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[C]]
-    ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[ADD]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[SEXT_INREG]]
     ; RV64I-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; RV64I-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
     ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[AND]], [[C2]]
@@ -359,18 +364,21 @@ body:             |
     ; RV64I-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1431655765
     ; RV64I-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[LSHR]], [[C3]]
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[AND]], [[AND2]]
+    ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
     ; RV64I-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; RV64I-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C2]]
+    ; RV64I-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG1]], [[C2]]
     ; RV64I-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[AND3]], [[C4]](s64)
     ; RV64I-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 858993459
     ; RV64I-NEXT: [[AND4:%[0-9]+]]:_(s64) = G_AND [[LSHR1]], [[C5]]
-    ; RV64I-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C5]]
+    ; RV64I-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG1]], [[C5]]
     ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[AND4]], [[AND5]]
+    ; RV64I-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD1]], 32
     ; RV64I-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; RV64I-NEXT: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[ADD1]], [[C6]](s64)
-    ; RV64I-NEXT: [[ADD2:%[0-9]+]]:_(s64) = G_ADD [[LSHR2]], [[ADD1]]
+    ; RV64I-NEXT: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[SEXT_INREG2]], [[C6]](s64)
+    ; RV64I-NEXT: [[ADD2:%[0-9]+]]:_(s64) = G_ADD [[LSHR2]], [[SEXT_INREG2]]
+    ; RV64I-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD2]], 32
     ; RV64I-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 252645135
-    ; RV64I-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[ADD2]], [[C7]]
+    ; RV64I-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG3]], [[C7]]
     ; RV64I-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 16843009
     ; RV64I-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND6]], [[C8]]
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ext-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ext-rv64.mir
index f3bc1ce..aff7d4d 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ext-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ext-rv64.mir
@@ -30,8 +30,9 @@ body:             |
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C]]
     ; CHECK-NEXT: $x10 = COPY [[AND]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv64.mir
index 4689a7d..776f5f5 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv64.mir
@@ -88,9 +88,10 @@ body:             |
     ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
     ; RV64I-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 31
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[C]], [[COPY1]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
     ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C1]]
     ; RV64I-NEXT: [[SLLW:%[0-9]+]]:_(s64) = G_SLLW [[COPY]], [[AND]]
-    ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C1]]
+    ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C1]]
     ; RV64I-NEXT: [[SRLW:%[0-9]+]]:_(s64) = G_SRLW [[COPY]], [[AND1]]
     ; RV64I-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[SLLW]], [[SRLW]]
     ; RV64I-NEXT: $x10 = COPY [[OR]](s64)
@@ -233,9 +234,10 @@ body:             |
     ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
     ; RV64I-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 31
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[C]], [[COPY1]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
     ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C1]]
     ; RV64I-NEXT: [[SRLW:%[0-9]+]]:_(s64) = G_SRLW [[COPY]], [[AND]]
-    ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C1]]
+    ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C1]]
     ; RV64I-NEXT: [[SLLW:%[0-9]+]]:_(s64) = G_SLLW [[COPY]], [[AND1]]
     ; RV64I-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[SRLW]], [[SLLW]]
     ; RV64I-NEXT: $x10 = COPY [[OR]](s64)
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir
index bf8c8d6..d162bfc 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir
@@ -16,8 +16,8 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ADD]](s64)
     ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[SEXT_INREG]](s64)
     ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
     ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[SEXT_INREG]](s64), [[SEXT_INREG1]]
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32)
@@ -97,7 +97,8 @@ body:             |
     ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C]](s64)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -2147483648
     ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[ASHR]], [[C1]]
-    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ADD1]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD1]], 32
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[SEXT_INREG3]](s64)
     ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s64), [[TRUNC1]], [[COPY2]]
     ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SELECT]](s32)
     ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
@@ -173,10 +174,11 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
     ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[SUB]](s64)
-    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 32
-    ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
-    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[SEXT_INREG]](s64), [[SEXT_INREG1]]
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[SEXT_INREG]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 32
+    ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[SEXT_INREG1]](s64), [[SEXT_INREG2]]
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
     ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
     ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s64), [[TRUNC1]], [[TRUNC]]
@@ -250,7 +252,8 @@ body:             |
     ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C]](s64)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -2147483648
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[ASHR]], [[C1]]
-    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ADD]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[SEXT_INREG3]](s64)
     ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s64), [[TRUNC1]], [[COPY2]]
     ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SELECT]](s32)
     ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sub-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sub-rv64.mir
index da3ab9e..7ab07ee 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sub-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sub-rv64.mir
@@ -69,7 +69,8 @@ body:             |
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
     ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: $x10 = COPY [[SUB]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; CHECK-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
index 0b876fe..9df319e 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
@@ -18,7 +18,7 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 2
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    srliw a2, a0, 4
@@ -30,15 +30,15 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -75,7 +75,7 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 2
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    srliw a2, a0, 4
@@ -87,15 +87,15 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -133,15 +133,14 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    li s0, 32
-; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    sext.w a2, a0
+; RV64I-NEXT:    addiw a0, a0, -1
 ; RV64I-NEXT:    li a1, 32
-; RV64I-NEXT:    beqz a2, .LBB2_2
+; RV64I-NEXT:    beqz a0, .LBB2_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 2
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    srliw a2, a0, 4
@@ -153,15 +152,15 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -200,7 +199,7 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a0, a0, 1
 ; RV64I-NEXT:    lui a1, 349525
 ; RV64I-NEXT:    or a0, s0, a0
-; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addi a1, a1, 1365
 ; RV64I-NEXT:    srliw a2, a0, 2
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    srliw a2, a0, 4
@@ -212,15 +211,15 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -271,7 +270,7 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
 ; RV64I-NEXT:    srliw a0, a0, 2
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srli a2, a0, 2
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    srli a2, a0, 4
@@ -283,15 +282,15 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -299,7 +298,7 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
 ; RV64I-NEXT:    li a1, 32
-; RV64I-NEXT:    sub a0, a1, a0
+; RV64I-NEXT:    subw a0, a1, a0
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    .cfi_restore ra
 ; RV64I-NEXT:    addi sp, sp, 16
@@ -408,19 +407,19 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    and a0, a1, a0
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -451,19 +450,19 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    and a0, a1, a0
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -493,19 +492,19 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    addi a1, s0, -1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -549,19 +548,19 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    addi a1, s0, -1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -669,18 +668,18 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    addi a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -706,18 +705,18 @@ define i1 @ctpop_i32_ult_two(i32 signext %a) nounwind {
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    addi a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -746,19 +745,19 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lwu a0, 0(a0)
 ; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addi a1, a1, 1365
 ; RV64I-NEXT:    srli a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -1057,7 +1056,7 @@ define i32 @abs_i32(i32 %x) {
 ; RV64I-LABEL: abs_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sraiw a1, a0, 31
-; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    addw a0, a0, a1
 ; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/scmp.ll b/llvm/test/CodeGen/RISCV/GlobalISel/scmp.ll
index 4346e04..daeb2e6 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/scmp.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/scmp.ll
@@ -97,7 +97,7 @@ define i32 @scmp.32.32(i32 %x, i32 %y) nounwind {
 ; RV64I-NEXT:    sext.w a1, a1
 ; RV64I-NEXT:    slt a2, a1, a0
 ; RV64I-NEXT:    slt a0, a0, a1
-; RV64I-NEXT:    sub a0, a2, a0
+; RV64I-NEXT:    subw a0, a2, a0
 ; RV64I-NEXT:    ret
   %1 = call i32 @llvm.scmp(i32 %x, i32 %y)
   ret i32 %1
@@ -122,7 +122,7 @@ define i32 @scmp.32.64(i64 %x, i64 %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slt a2, a1, a0
 ; RV64I-NEXT:    slt a0, a0, a1
-; RV64I-NEXT:    sub a0, a2, a0
+; RV64I-NEXT:    subw a0, a2, a0
 ; RV64I-NEXT:    ret
   %1 = call i32 @llvm.scmp(i64 %x, i64 %y)
   ret i32 %1
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/ucmp.ll b/llvm/test/CodeGen/RISCV/GlobalISel/ucmp.ll
index 9784c58..463883b 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/ucmp.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/ucmp.ll
@@ -97,7 +97,7 @@ define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind {
 ; RV64I-NEXT:    sext.w a1, a1
 ; RV64I-NEXT:    sltu a2, a1, a0
 ; RV64I-NEXT:    sltu a0, a0, a1
-; RV64I-NEXT:    sub a0, a2, a0
+; RV64I-NEXT:    subw a0, a2, a0
 ; RV64I-NEXT:    ret
   %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
   ret i32 %1
@@ -115,7 +115,7 @@ define i32 @ucmp.32.32_sext(i32 signext %x, i32 signext %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sltu a2, a1, a0
 ; RV64I-NEXT:    sltu a0, a0, a1
-; RV64I-NEXT:    sub a0, a2, a0
+; RV64I-NEXT:    subw a0, a2, a0
 ; RV64I-NEXT:    ret
   %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
   ret i32 %1
@@ -135,7 +135,7 @@ define i32 @ucmp.32.32_zext(i32 zeroext %x, i32 zeroext %y) nounwind {
 ; RV64I-NEXT:    sext.w a1, a1
 ; RV64I-NEXT:    sltu a2, a1, a0
 ; RV64I-NEXT:    sltu a0, a0, a1
-; RV64I-NEXT:    sub a0, a2, a0
+; RV64I-NEXT:    subw a0, a2, a0
 ; RV64I-NEXT:    ret
   %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
   ret i32 %1
@@ -160,7 +160,7 @@ define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sltu a2, a1, a0
 ; RV64I-NEXT:    sltu a0, a0, a1
-; RV64I-NEXT:    sub a0, a2, a0
+; RV64I-NEXT:    subw a0, a2, a0
 ; RV64I-NEXT:    ret
   %1 = call i32 @llvm.ucmp(i64 %x, i64 %y)
   ret i32 %1
diff --git a/llvm/test/CodeGen/RISCV/add_sext_shl_constant.ll b/llvm/test/CodeGen/RISCV/add_sext_shl_constant.ll
index 47b6c07..d7f62ae 100644
--- a/llvm/test/CodeGen/RISCV/add_sext_shl_constant.ll
+++ b/llvm/test/CodeGen/RISCV/add_sext_shl_constant.ll
@@ -1,17 +1,28 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=riscv64 < %s | FileCheck -check-prefix=RV64 %s
+; RUN: llc -mtriple=riscv64 < %s | FileCheck -check-prefixes=RV64,NO-ZBA %s
+; RUN: llc -mtriple=riscv64 -mattr=+zba < %s | FileCheck -check-prefixes=RV64,ZBA %s
 
 define void @add_sext_shl_moreOneUse_add(ptr %array1, i32 %a, i32 %b) {
-; RV64-LABEL: add_sext_shl_moreOneUse_add:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi a3, a1, 5
-; RV64-NEXT:    sext.w a1, a1
-; RV64-NEXT:    slli a1, a1, 2
-; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    sw a2, 20(a0)
-; RV64-NEXT:    sw a2, 24(a0)
-; RV64-NEXT:    sw a3, 140(a0)
-; RV64-NEXT:    ret
+; NO-ZBA-LABEL: add_sext_shl_moreOneUse_add:
+; NO-ZBA:       # %bb.0: # %entry
+; NO-ZBA-NEXT:    addi a3, a1, 5
+; NO-ZBA-NEXT:    sext.w a1, a1
+; NO-ZBA-NEXT:    slli a1, a1, 2
+; NO-ZBA-NEXT:    add a0, a1, a0
+; NO-ZBA-NEXT:    sw a2, 20(a0)
+; NO-ZBA-NEXT:    sw a2, 24(a0)
+; NO-ZBA-NEXT:    sw a3, 140(a0)
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_sext_shl_moreOneUse_add:
+; ZBA:       # %bb.0: # %entry
+; ZBA-NEXT:    addi a3, a1, 5
+; ZBA-NEXT:    sext.w a1, a1
+; ZBA-NEXT:    sh2add a0, a1, a0
+; ZBA-NEXT:    sw a2, 20(a0)
+; ZBA-NEXT:    sw a2, 24(a0)
+; ZBA-NEXT:    sw a3, 140(a0)
+; ZBA-NEXT:    ret
 entry:
   %add = add nsw i32 %a, 5
   %idxprom = sext i32 %add to i64
@@ -29,19 +40,32 @@ entry:
 }
 
 define void @add_sext_shl_moreOneUse_addexceedsign12(ptr %array1, i32 %a, i32 %b) {
-; RV64-LABEL: add_sext_shl_moreOneUse_addexceedsign12:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi a3, a1, 2047
-; RV64-NEXT:    lui a4, 2
-; RV64-NEXT:    sext.w a1, a1
-; RV64-NEXT:    addi a3, a3, 1
-; RV64-NEXT:    slli a1, a1, 2
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    sw a2, 0(a0)
-; RV64-NEXT:    sw a3, 4(a0)
-; RV64-NEXT:    sw a2, 120(a0)
-; RV64-NEXT:    ret
+; NO-ZBA-LABEL: add_sext_shl_moreOneUse_addexceedsign12:
+; NO-ZBA:       # %bb.0: # %entry
+; NO-ZBA-NEXT:    addi a3, a1, 2047
+; NO-ZBA-NEXT:    lui a4, 2
+; NO-ZBA-NEXT:    sext.w a1, a1
+; NO-ZBA-NEXT:    addi a3, a3, 1
+; NO-ZBA-NEXT:    slli a1, a1, 2
+; NO-ZBA-NEXT:    add a0, a0, a4
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    sw a2, 0(a0)
+; NO-ZBA-NEXT:    sw a3, 4(a0)
+; NO-ZBA-NEXT:    sw a2, 120(a0)
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_sext_shl_moreOneUse_addexceedsign12:
+; ZBA:       # %bb.0: # %entry
+; ZBA-NEXT:    addi a3, a1, 2047
+; ZBA-NEXT:    lui a4, 2
+; ZBA-NEXT:    sext.w a1, a1
+; ZBA-NEXT:    addi a3, a3, 1
+; ZBA-NEXT:    sh2add a0, a1, a0
+; ZBA-NEXT:    add a0, a0, a4
+; ZBA-NEXT:    sw a2, 0(a0)
+; ZBA-NEXT:    sw a3, 4(a0)
+; ZBA-NEXT:    sw a2, 120(a0)
+; ZBA-NEXT:    ret
 entry:
   %add = add nsw i32 %a, 2048
   %idxprom = sext i32 %add to i64
@@ -57,16 +81,26 @@ entry:
 }
 
 define void @add_sext_shl_moreOneUse_sext(ptr %array1, i32 %a, i32 %b) {
-; RV64-LABEL: add_sext_shl_moreOneUse_sext:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sext.w a1, a1
-; RV64-NEXT:    addi a3, a1, 5
-; RV64-NEXT:    slli a1, a1, 2
-; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    sw a2, 20(a0)
-; RV64-NEXT:    sw a2, 24(a0)
-; RV64-NEXT:    sd a3, 140(a0)
-; RV64-NEXT:    ret
+; NO-ZBA-LABEL: add_sext_shl_moreOneUse_sext:
+; NO-ZBA:       # %bb.0: # %entry
+; NO-ZBA-NEXT:    sext.w a1, a1
+; NO-ZBA-NEXT:    addi a3, a1, 5
+; NO-ZBA-NEXT:    slli a1, a1, 2
+; NO-ZBA-NEXT:    add a0, a1, a0
+; NO-ZBA-NEXT:    sw a2, 20(a0)
+; NO-ZBA-NEXT:    sw a2, 24(a0)
+; NO-ZBA-NEXT:    sd a3, 140(a0)
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_sext_shl_moreOneUse_sext:
+; ZBA:       # %bb.0: # %entry
+; ZBA-NEXT:    sext.w a1, a1
+; ZBA-NEXT:    addi a3, a1, 5
+; ZBA-NEXT:    sh2add a0, a1, a0
+; ZBA-NEXT:    sw a2, 20(a0)
+; ZBA-NEXT:    sw a2, 24(a0)
+; ZBA-NEXT:    sd a3, 140(a0)
+; ZBA-NEXT:    ret
 entry:
   %add = add nsw i32 %a, 5
   %idxprom = sext i32 %add to i64
@@ -85,20 +119,34 @@ entry:
 
 ; test of jumpping, find add's operand has one more use can simplified
 define void @add_sext_shl_moreOneUse_add_inSelect(ptr %array1, i32 signext  %a, i32 %b, i32 signext %x) {
-; RV64-LABEL: add_sext_shl_moreOneUse_add_inSelect:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi a4, a1, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    bgtz a3, .LBB3_2
-; RV64-NEXT:  # %bb.1: # %entry
-; RV64-NEXT:    mv a5, a2
-; RV64-NEXT:  .LBB3_2: # %entry
-; RV64-NEXT:    slli a1, a1, 2
-; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    sw a5, 20(a0)
-; RV64-NEXT:    sw a5, 24(a0)
-; RV64-NEXT:    sw a4, 140(a0)
-; RV64-NEXT:    ret
+; NO-ZBA-LABEL: add_sext_shl_moreOneUse_add_inSelect:
+; NO-ZBA:       # %bb.0: # %entry
+; NO-ZBA-NEXT:    addi a4, a1, 5
+; NO-ZBA-NEXT:    mv a5, a4
+; NO-ZBA-NEXT:    bgtz a3, .LBB3_2
+; NO-ZBA-NEXT:  # %bb.1: # %entry
+; NO-ZBA-NEXT:    mv a5, a2
+; NO-ZBA-NEXT:  .LBB3_2: # %entry
+; NO-ZBA-NEXT:    slli a1, a1, 2
+; NO-ZBA-NEXT:    add a0, a1, a0
+; NO-ZBA-NEXT:    sw a5, 20(a0)
+; NO-ZBA-NEXT:    sw a5, 24(a0)
+; NO-ZBA-NEXT:    sw a4, 140(a0)
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_sext_shl_moreOneUse_add_inSelect:
+; ZBA:       # %bb.0: # %entry
+; ZBA-NEXT:    addi a4, a1, 5
+; ZBA-NEXT:    mv a5, a4
+; ZBA-NEXT:    bgtz a3, .LBB3_2
+; ZBA-NEXT:  # %bb.1: # %entry
+; ZBA-NEXT:    mv a5, a2
+; ZBA-NEXT:  .LBB3_2: # %entry
+; ZBA-NEXT:    sh2add a0, a1, a0
+; ZBA-NEXT:    sw a5, 20(a0)
+; ZBA-NEXT:    sw a5, 24(a0)
+; ZBA-NEXT:    sw a4, 140(a0)
+; ZBA-NEXT:    ret
 entry:
   %add = add nsw i32 %a, 5
   %cmp = icmp sgt i32 %x, 0
@@ -118,23 +166,40 @@ entry:
 }
 
 define void @add_sext_shl_moreOneUse_add_inSelect_addexceedsign12(ptr %array1, i32 signext  %a, i32 %b, i32 signext %x) {
-; RV64-LABEL: add_sext_shl_moreOneUse_add_inSelect_addexceedsign12:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi a4, a1, 2047
-; RV64-NEXT:    lui a5, 2
-; RV64-NEXT:    slli a6, a1, 2
-; RV64-NEXT:    addi a1, a4, 1
-; RV64-NEXT:    add a0, a0, a6
-; RV64-NEXT:    add a0, a0, a5
-; RV64-NEXT:    mv a4, a1
-; RV64-NEXT:    bgtz a3, .LBB4_2
-; RV64-NEXT:  # %bb.1: # %entry
-; RV64-NEXT:    mv a4, a2
-; RV64-NEXT:  .LBB4_2: # %entry
-; RV64-NEXT:    sw a4, 0(a0)
-; RV64-NEXT:    sw a4, 4(a0)
-; RV64-NEXT:    sw a1, 120(a0)
-; RV64-NEXT:    ret
+; NO-ZBA-LABEL: add_sext_shl_moreOneUse_add_inSelect_addexceedsign12:
+; NO-ZBA:       # %bb.0: # %entry
+; NO-ZBA-NEXT:    addi a4, a1, 2047
+; NO-ZBA-NEXT:    lui a5, 2
+; NO-ZBA-NEXT:    slli a6, a1, 2
+; NO-ZBA-NEXT:    addi a1, a4, 1
+; NO-ZBA-NEXT:    add a0, a0, a6
+; NO-ZBA-NEXT:    add a0, a0, a5
+; NO-ZBA-NEXT:    mv a4, a1
+; NO-ZBA-NEXT:    bgtz a3, .LBB4_2
+; NO-ZBA-NEXT:  # %bb.1: # %entry
+; NO-ZBA-NEXT:    mv a4, a2
+; NO-ZBA-NEXT:  .LBB4_2: # %entry
+; NO-ZBA-NEXT:    sw a4, 0(a0)
+; NO-ZBA-NEXT:    sw a4, 4(a0)
+; NO-ZBA-NEXT:    sw a1, 120(a0)
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_sext_shl_moreOneUse_add_inSelect_addexceedsign12:
+; ZBA:       # %bb.0: # %entry
+; ZBA-NEXT:    addi a4, a1, 2047
+; ZBA-NEXT:    lui a5, 2
+; ZBA-NEXT:    addi a4, a4, 1
+; ZBA-NEXT:    sh2add a0, a1, a0
+; ZBA-NEXT:    add a0, a0, a5
+; ZBA-NEXT:    mv a1, a4
+; ZBA-NEXT:    bgtz a3, .LBB4_2
+; ZBA-NEXT:  # %bb.1: # %entry
+; ZBA-NEXT:    mv a1, a2
+; ZBA-NEXT:  .LBB4_2: # %entry
+; ZBA-NEXT:    sw a1, 0(a0)
+; ZBA-NEXT:    sw a1, 4(a0)
+; ZBA-NEXT:    sw a4, 120(a0)
+; ZBA-NEXT:    ret
 entry:
   %add = add nsw i32 %a, 2048
   %cmp = icmp sgt i32 %x, 0
@@ -152,20 +217,34 @@ entry:
 }
 
 define void @add_shl_moreOneUse_inSelect(ptr %array1, i64 %a, i64 %b, i64 %x) {
-; RV64-LABEL: add_shl_moreOneUse_inSelect:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi a4, a1, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    bgtz a3, .LBB5_2
-; RV64-NEXT:  # %bb.1: # %entry
-; RV64-NEXT:    mv a5, a2
-; RV64-NEXT:  .LBB5_2: # %entry
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    sd a5, 40(a0)
-; RV64-NEXT:    sd a5, 48(a0)
-; RV64-NEXT:    sd a4, 280(a0)
-; RV64-NEXT:    ret
+; NO-ZBA-LABEL: add_shl_moreOneUse_inSelect:
+; NO-ZBA:       # %bb.0: # %entry
+; NO-ZBA-NEXT:    addi a4, a1, 5
+; NO-ZBA-NEXT:    mv a5, a4
+; NO-ZBA-NEXT:    bgtz a3, .LBB5_2
+; NO-ZBA-NEXT:  # %bb.1: # %entry
+; NO-ZBA-NEXT:    mv a5, a2
+; NO-ZBA-NEXT:  .LBB5_2: # %entry
+; NO-ZBA-NEXT:    slli a1, a1, 3
+; NO-ZBA-NEXT:    add a0, a1, a0
+; NO-ZBA-NEXT:    sd a5, 40(a0)
+; NO-ZBA-NEXT:    sd a5, 48(a0)
+; NO-ZBA-NEXT:    sd a4, 280(a0)
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_inSelect:
+; ZBA:       # %bb.0: # %entry
+; ZBA-NEXT:    addi a4, a1, 5
+; ZBA-NEXT:    mv a5, a4
+; ZBA-NEXT:    bgtz a3, .LBB5_2
+; ZBA-NEXT:  # %bb.1: # %entry
+; ZBA-NEXT:    mv a5, a2
+; ZBA-NEXT:  .LBB5_2: # %entry
+; ZBA-NEXT:    sh3add a0, a1, a0
+; ZBA-NEXT:    sd a5, 40(a0)
+; ZBA-NEXT:    sd a5, 48(a0)
+; ZBA-NEXT:    sd a4, 280(a0)
+; ZBA-NEXT:    ret
 entry:
   %add = add nsw i64 %a, 5
   %cmp = icmp sgt i64 %x, 0
@@ -180,3 +259,103 @@ entry:
   store i64 %add, ptr %arrayidx6
   ret void
 }
+
+define i64 @add_shl_moreOneUse_sh1add(i64 %x) {
+; NO-ZBA-LABEL: add_shl_moreOneUse_sh1add:
+; NO-ZBA:       # %bb.0:
+; NO-ZBA-NEXT:    ori a1, a0, 1
+; NO-ZBA-NEXT:    slli a0, a0, 1
+; NO-ZBA-NEXT:    ori a0, a0, 2
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_sh1add:
+; ZBA:       # %bb.0:
+; ZBA-NEXT:    ori a0, a0, 1
+; ZBA-NEXT:    sh1add a0, a0, a0
+; ZBA-NEXT:    ret
+  %or = or i64 %x, 1
+  %mul = shl i64 %or, 1
+  %add = add i64 %mul, %or
+  ret i64 %add
+}
+
+define i64 @add_shl_moreOneUse_sh2add(i64 %x) {
+; NO-ZBA-LABEL: add_shl_moreOneUse_sh2add:
+; NO-ZBA:       # %bb.0:
+; NO-ZBA-NEXT:    ori a1, a0, 1
+; NO-ZBA-NEXT:    slli a0, a0, 2
+; NO-ZBA-NEXT:    ori a0, a0, 4
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_sh2add:
+; ZBA:       # %bb.0:
+; ZBA-NEXT:    ori a0, a0, 1
+; ZBA-NEXT:    sh2add a0, a0, a0
+; ZBA-NEXT:    ret
+  %or = or i64 %x, 1
+  %mul = shl i64 %or, 2
+  %add = add i64 %mul, %or
+  ret i64 %add
+}
+
+define i64 @add_shl_moreOneUse_sh3add(i64 %x) {
+; NO-ZBA-LABEL: add_shl_moreOneUse_sh3add:
+; NO-ZBA:       # %bb.0:
+; NO-ZBA-NEXT:    ori a1, a0, 1
+; NO-ZBA-NEXT:    slli a0, a0, 3
+; NO-ZBA-NEXT:    ori a0, a0, 8
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_sh3add:
+; ZBA:       # %bb.0:
+; ZBA-NEXT:    ori a0, a0, 1
+; ZBA-NEXT:    sh3add a0, a0, a0
+; ZBA-NEXT:    ret
+  %or = or i64 %x, 1
+  %mul = shl i64 %or, 3
+  %add = add i64 %mul, %or
+  ret i64 %add
+}
+
+;; Covers a case which previously crashed (pr119527)
+define i64 @add_shl_sext(i32 %1) {
+; RV64-LABEL: add_shl_sext:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi a1, a0, 3
+; RV64-NEXT:    sllw a0, a1, a0
+; RV64-NEXT:    ret
+  %3 = add i32 %1, 3
+  %4 = shl i32 %3, %1
+  %5 = sext i32 %4 to i64
+  ret i64 %5
+}
+
+define i64 @add_shl_moreOneUse_sh4add(i64 %x) {
+; RV64-LABEL: add_shl_moreOneUse_sh4add:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ori a1, a0, 1
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    ori a0, a0, 16
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    ret
+  %or = or i64 %x, 1
+  %mul = shl i64 %or, 4
+  %add = add i64 %mul, %or
+  ret i64 %add
+}
+
+define i64 @add_shl_rhs_constant(i64 %x, i64 %y) {
+; RV64-LABEL: add_shl_rhs_constant:
+; RV64:       # %bb.0:
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    ret
+  %a = add i64 %x, 1
+  %b = add i64 %y, %a
+  %c = shl i64 %b, 3
+  %d = add i64 %c, -8
+  ret i64 %d
+}
diff --git a/llvm/test/CodeGen/RISCV/add_shl_constant.ll b/llvm/test/CodeGen/RISCV/add_shl_constant.ll
index 71b6186..a4da9e2 100644
--- a/llvm/test/CodeGen/RISCV/add_shl_constant.ll
+++ b/llvm/test/CodeGen/RISCV/add_shl_constant.ll
@@ -1,13 +1,20 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=riscv32  < %s | FileCheck -check-prefix=RV32 %s
+; RUN: llc -mtriple=riscv32  < %s | FileCheck -check-prefixes=RV32,NO-ZBA %s
+; RUN: llc -mtriple=riscv32 -mattr=+zba  < %s | FileCheck -check-prefixes=RV32,ZBA %s
 
 define i32 @add_shl_oneUse(i32 %x, i32 %y) nounwind {
-; RV32-LABEL: add_shl_oneUse:
-; RV32:       # %bb.0:
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    addi a0, a0, 984
-; RV32-NEXT:    ret
+; NO-ZBA-LABEL: add_shl_oneUse:
+; NO-ZBA:       # %bb.0:
+; NO-ZBA-NEXT:    slli a0, a0, 3
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    addi a0, a0, 984
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_oneUse:
+; ZBA:       # %bb.0:
+; ZBA-NEXT:    addi a0, a0, 123
+; ZBA-NEXT:    sh3add a0, a0, a1
+; ZBA-NEXT:    ret
   %add.0 = add i32 %x, 123
   %shl = shl i32 %add.0, 3
   %add.1 = add i32 %shl, %y
@@ -15,15 +22,24 @@ define i32 @add_shl_oneUse(i32 %x, i32 %y) nounwind {
 }
 
 define void @add_shl_moreOneUse_inStore(ptr %array1, i32 %a, i32 %b)  {
-; RV32-LABEL: add_shl_moreOneUse_inStore:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    addi a3, a1, 5
-; RV32-NEXT:    slli a1, a1, 2
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    sw a2, 20(a0)
-; RV32-NEXT:    sw a2, 24(a0)
-; RV32-NEXT:    sw a3, 140(a0)
-; RV32-NEXT:    ret
+; NO-ZBA-LABEL: add_shl_moreOneUse_inStore:
+; NO-ZBA:       # %bb.0: # %entry
+; NO-ZBA-NEXT:    addi a3, a1, 5
+; NO-ZBA-NEXT:    slli a1, a1, 2
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    sw a2, 20(a0)
+; NO-ZBA-NEXT:    sw a2, 24(a0)
+; NO-ZBA-NEXT:    sw a3, 140(a0)
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_inStore:
+; ZBA:       # %bb.0: # %entry
+; ZBA-NEXT:    addi a3, a1, 5
+; ZBA-NEXT:    sh2add a0, a1, a0
+; ZBA-NEXT:    sw a2, 20(a0)
+; ZBA-NEXT:    sw a2, 24(a0)
+; ZBA-NEXT:    sw a3, 140(a0)
+; ZBA-NEXT:    ret
 entry:
   %add = add nsw i32 %a, 5
   %arrayidx = getelementptr inbounds i32, ptr %array1, i32 %add
@@ -37,18 +53,30 @@ entry:
 }
 
 define void @add_shl_moreOneUse_inStore_addexceedsign12(ptr %array1, i32 %a, i32 %b)  {
-; RV32-LABEL: add_shl_moreOneUse_inStore_addexceedsign12:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    addi a3, a1, 2047
-; RV32-NEXT:    lui a4, 2
-; RV32-NEXT:    slli a1, a1, 2
-; RV32-NEXT:    addi a3, a3, 1
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    sw a2, 0(a0)
-; RV32-NEXT:    sw a3, 4(a0)
-; RV32-NEXT:    sw a2, 120(a0)
-; RV32-NEXT:    ret
+; NO-ZBA-LABEL: add_shl_moreOneUse_inStore_addexceedsign12:
+; NO-ZBA:       # %bb.0: # %entry
+; NO-ZBA-NEXT:    addi a3, a1, 2047
+; NO-ZBA-NEXT:    lui a4, 2
+; NO-ZBA-NEXT:    slli a1, a1, 2
+; NO-ZBA-NEXT:    addi a3, a3, 1
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    add a0, a0, a4
+; NO-ZBA-NEXT:    sw a2, 0(a0)
+; NO-ZBA-NEXT:    sw a3, 4(a0)
+; NO-ZBA-NEXT:    sw a2, 120(a0)
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_inStore_addexceedsign12:
+; ZBA:       # %bb.0: # %entry
+; ZBA-NEXT:    addi a3, a1, 2047
+; ZBA-NEXT:    lui a4, 2
+; ZBA-NEXT:    sh2add a0, a1, a0
+; ZBA-NEXT:    addi a3, a3, 1
+; ZBA-NEXT:    add a0, a0, a4
+; ZBA-NEXT:    sw a2, 0(a0)
+; ZBA-NEXT:    sw a3, 4(a0)
+; ZBA-NEXT:    sw a2, 120(a0)
+; ZBA-NEXT:    ret
 entry:
   %add = add nsw i32 %a, 2048
   %arrayidx = getelementptr inbounds i32, ptr %array1, i32 %add
@@ -62,20 +90,34 @@ entry:
 }
 
 define void @add_shl_moreOneUse_inSelect(ptr %array1, i32 %a, i32 %b, i32 %x) {
-; RV32-LABEL: add_shl_moreOneUse_inSelect:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    addi a4, a1, 5
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    bgtz a3, .LBB3_2
-; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    mv a5, a2
-; RV32-NEXT:  .LBB3_2: # %entry
-; RV32-NEXT:    slli a1, a1, 2
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    sw a5, 20(a0)
-; RV32-NEXT:    sw a5, 24(a0)
-; RV32-NEXT:    sw a4, 140(a0)
-; RV32-NEXT:    ret
+; NO-ZBA-LABEL: add_shl_moreOneUse_inSelect:
+; NO-ZBA:       # %bb.0: # %entry
+; NO-ZBA-NEXT:    addi a4, a1, 5
+; NO-ZBA-NEXT:    mv a5, a4
+; NO-ZBA-NEXT:    bgtz a3, .LBB3_2
+; NO-ZBA-NEXT:  # %bb.1: # %entry
+; NO-ZBA-NEXT:    mv a5, a2
+; NO-ZBA-NEXT:  .LBB3_2: # %entry
+; NO-ZBA-NEXT:    slli a1, a1, 2
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    sw a5, 20(a0)
+; NO-ZBA-NEXT:    sw a5, 24(a0)
+; NO-ZBA-NEXT:    sw a4, 140(a0)
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_inSelect:
+; ZBA:       # %bb.0: # %entry
+; ZBA-NEXT:    addi a4, a1, 5
+; ZBA-NEXT:    mv a5, a4
+; ZBA-NEXT:    bgtz a3, .LBB3_2
+; ZBA-NEXT:  # %bb.1: # %entry
+; ZBA-NEXT:    mv a5, a2
+; ZBA-NEXT:  .LBB3_2: # %entry
+; ZBA-NEXT:    sh2add a0, a1, a0
+; ZBA-NEXT:    sw a5, 20(a0)
+; ZBA-NEXT:    sw a5, 24(a0)
+; ZBA-NEXT:    sw a4, 140(a0)
+; ZBA-NEXT:    ret
 entry:
   %add = add nsw i32 %a, 5
   %cmp = icmp sgt i32 %x, 0
@@ -91,23 +133,40 @@ entry:
 }
 
 define void @add_shl_moreOneUse_inSelect_addexceedsign12(ptr %array1, i32 %a, i32 %b, i32 %x) {
-; RV32-LABEL: add_shl_moreOneUse_inSelect_addexceedsign12:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    addi a4, a1, 2047
-; RV32-NEXT:    addi a4, a4, 1
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    bgtz a3, .LBB4_2
-; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    mv a5, a2
-; RV32-NEXT:  .LBB4_2: # %entry
-; RV32-NEXT:    lui a2, 2
-; RV32-NEXT:    slli a1, a1, 2
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    sw a5, 0(a0)
-; RV32-NEXT:    sw a5, 4(a0)
-; RV32-NEXT:    sw a4, 120(a0)
-; RV32-NEXT:    ret
+; NO-ZBA-LABEL: add_shl_moreOneUse_inSelect_addexceedsign12:
+; NO-ZBA:       # %bb.0: # %entry
+; NO-ZBA-NEXT:    addi a4, a1, 2047
+; NO-ZBA-NEXT:    addi a4, a4, 1
+; NO-ZBA-NEXT:    mv a5, a4
+; NO-ZBA-NEXT:    bgtz a3, .LBB4_2
+; NO-ZBA-NEXT:  # %bb.1: # %entry
+; NO-ZBA-NEXT:    mv a5, a2
+; NO-ZBA-NEXT:  .LBB4_2: # %entry
+; NO-ZBA-NEXT:    lui a2, 2
+; NO-ZBA-NEXT:    slli a1, a1, 2
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    add a0, a0, a2
+; NO-ZBA-NEXT:    sw a5, 0(a0)
+; NO-ZBA-NEXT:    sw a5, 4(a0)
+; NO-ZBA-NEXT:    sw a4, 120(a0)
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_inSelect_addexceedsign12:
+; ZBA:       # %bb.0: # %entry
+; ZBA-NEXT:    addi a4, a1, 2047
+; ZBA-NEXT:    addi a4, a4, 1
+; ZBA-NEXT:    mv a5, a4
+; ZBA-NEXT:    bgtz a3, .LBB4_2
+; ZBA-NEXT:  # %bb.1: # %entry
+; ZBA-NEXT:    mv a5, a2
+; ZBA-NEXT:  .LBB4_2: # %entry
+; ZBA-NEXT:    lui a2, 2
+; ZBA-NEXT:    sh2add a0, a1, a0
+; ZBA-NEXT:    add a0, a0, a2
+; ZBA-NEXT:    sw a5, 0(a0)
+; ZBA-NEXT:    sw a5, 4(a0)
+; ZBA-NEXT:    sw a4, 120(a0)
+; ZBA-NEXT:    ret
 entry:
   %add = add nsw i32 %a, 2048
   %cmp = icmp sgt i32 %x, 0
@@ -121,3 +180,77 @@ entry:
   store i32 %add, ptr %arrayidx6
   ret void
 }
+
+define i32 @add_shl_moreOneUse_sh1add(i32 %x) {
+; NO-ZBA-LABEL: add_shl_moreOneUse_sh1add:
+; NO-ZBA:       # %bb.0:
+; NO-ZBA-NEXT:    ori a1, a0, 1
+; NO-ZBA-NEXT:    slli a0, a0, 1
+; NO-ZBA-NEXT:    ori a0, a0, 2
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_sh1add:
+; ZBA:       # %bb.0:
+; ZBA-NEXT:    ori a0, a0, 1
+; ZBA-NEXT:    sh1add a0, a0, a0
+; ZBA-NEXT:    ret
+  %or = or i32 %x, 1
+  %mul = shl i32 %or, 1
+  %add = add i32 %mul, %or
+  ret i32 %add
+}
+
+define i32 @add_shl_moreOneUse_sh2add(i32 %x) {
+; NO-ZBA-LABEL: add_shl_moreOneUse_sh2add:
+; NO-ZBA:       # %bb.0:
+; NO-ZBA-NEXT:    ori a1, a0, 1
+; NO-ZBA-NEXT:    slli a0, a0, 2
+; NO-ZBA-NEXT:    ori a0, a0, 4
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_sh2add:
+; ZBA:       # %bb.0:
+; ZBA-NEXT:    ori a0, a0, 1
+; ZBA-NEXT:    sh2add a0, a0, a0
+; ZBA-NEXT:    ret
+  %or = or i32 %x, 1
+  %mul = shl i32 %or, 2
+  %add = add i32 %mul, %or
+  ret i32 %add
+}
+
+define i32 @add_shl_moreOneUse_sh3add(i32 %x) {
+; NO-ZBA-LABEL: add_shl_moreOneUse_sh3add:
+; NO-ZBA:       # %bb.0:
+; NO-ZBA-NEXT:    ori a1, a0, 1
+; NO-ZBA-NEXT:    slli a0, a0, 3
+; NO-ZBA-NEXT:    ori a0, a0, 8
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_sh3add:
+; ZBA:       # %bb.0:
+; ZBA-NEXT:    ori a0, a0, 1
+; ZBA-NEXT:    sh3add a0, a0, a0
+; ZBA-NEXT:    ret
+  %or = or i32 %x, 1
+  %mul = shl i32 %or, 3
+  %add = add i32 %mul, %or
+  ret i32 %add
+}
+
+define i32 @add_shl_moreOneUse_sh4add(i32 %x) {
+; RV32-LABEL: add_shl_moreOneUse_sh4add:
+; RV32:       # %bb.0:
+; RV32-NEXT:    ori a1, a0, 1
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    ori a0, a0, 16
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    ret
+  %or = or i32 %x, 1
+  %mul = shl i32 %or, 4
+  %add = add i32 %mul, %or
+  ret i32 %add
+}
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index f63bc94..c0fcc6f 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -82,6 +82,9 @@
 ; RUN: llc -mtriple=riscv32 -mattr=+xtheadsync %s -o - | FileCheck --check-prefix=RV32XTHEADSYNC %s
 ; RUN: llc -mtriple=riscv32 -mattr=+xwchc %s -o - | FileCheck --check-prefix=RV32XWCHC %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcia %s -o - | FileCheck --check-prefix=RV32XQCIA %s
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqciac %s -o - | FileCheck --check-prefix=RV32XQCIAC %s
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicli %s -o - | FileCheck --check-prefix=RV32XQCICLI %s
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm %s -o - | FileCheck --check-prefix=RV32XQCICM %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcics %s -o - | FileCheck --check-prefix=RV32XQCICS %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicsr %s -o - | FileCheck --check-prefix=RV32XQCICSR %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcilsm %s -o - | FileCheck --check-prefix=RV32XQCILSM %s
@@ -294,6 +297,8 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+supm %s -o - | FileCheck --check-prefix=RV64SUPM %s
 ; RUN: llc -mtriple=riscv64 -mattr=+experimental-smctr  %s -o - | FileCheck --check-prefix=RV64SMCTR %s
 ; RUN: llc -mtriple=riscv64 -mattr=+experimental-ssctr  %s -o - | FileCheck --check-prefix=RV64SSCTR %s
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-sdext  %s -o - | FileCheck --check-prefix=RV64SDEXT %s
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-sdtrig  %s -o - | FileCheck --check-prefix=RV64SDTRIG %s
 
 ; Tests for profile features.
 ; RUN: llc -mtriple=riscv32 -mattr=+rvi20u32 %s -o - | FileCheck --check-prefix=RVI20U32 %s
@@ -391,6 +396,9 @@
 ; RV32XTHEADSYNC: .attribute 5, "rv32i2p1_xtheadsync1p0"
 ; RV32XWCHC: .attribute 5, "rv32i2p1_xwchc2p2"
 ; RV32XQCIA: .attribute 5, "rv32i2p1_xqcia0p2"
+; RV32XQCIAC: .attribute 5, "rv32i2p1_zca1p0_xqciac0p2"
+; RV32XQCICLI: .attribute 5, "rv32i2p1_xqcicli0p2"
+; RV32XQCICM: .attribute 5, "rv32i2p1_zca1p0_xqcicm0p2"
 ; RV32XQCICS: .attribute 5, "rv32i2p1_xqcics0p2"
 ; RV32XQCICSR: .attribute 5, "rv32i2p1_xqcicsr0p2"
 ; RV32XQCILSM: .attribute 5, "rv32i2p1_xqcilsm0p2"
@@ -601,6 +609,8 @@
 ; RV64SUPM: .attribute 5, "rv64i2p1_supm1p0"
 ; RV64SMCTR: .attribute 5, "rv64i2p1_smctr1p0_sscsrind1p0"
 ; RV64SSCTR: .attribute 5, "rv64i2p1_sscsrind1p0_ssctr1p0"
+; RV64SDEXT: .attribute 5, "rv64i2p1_sdext1p0"
+; RV64SDTRIG: .attribute 5, "rv64i2p1_sdtrig1p0"
 
 ; RVI20U32: .attribute 5, "rv32i2p1"
 ; RVI20U64: .attribute 5, "rv64i2p1"
diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll
index 99db90c..70fbda4 100644
--- a/llvm/test/CodeGen/RISCV/features-info.ll
+++ b/llvm/test/CodeGen/RISCV/features-info.ll
@@ -15,6 +15,8 @@
 ; CHECK:   e                                - 'E' (Embedded Instruction Set with 16 GPRs).
 ; CHECK:   experimental                     - Experimental intrinsics.
 ; CHECK:   experimental-rvm23u32            - RISC-V experimental-rvm23u32 profile.
+; CHECK:   experimental-sdext               - 'Sdext' (External debugger).
+; CHECK:   experimental-sdtrig              - 'Sdtrig' (Debugger triggers).
 ; CHECK:   experimental-smctr               - 'Smctr' (Control Transfer Records Machine Level).
 ; CHECK:   experimental-ssctr               - 'Ssctr' (Control Transfer Records Supervisor Level).
 ; CHECK:   experimental-svukte              - 'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses).
diff --git a/llvm/test/CodeGen/RISCV/kcfi-isel-mir.ll b/llvm/test/CodeGen/RISCV/kcfi-isel-mir.ll
index 4c47b5f..2c428cf 100644
--- a/llvm/test/CodeGen/RISCV/kcfi-isel-mir.ll
+++ b/llvm/test/CodeGen/RISCV/kcfi-isel-mir.ll
@@ -20,7 +20,7 @@ define void @f2(ptr noundef %x) #0 {
   ; CHECK-NEXT:   liveins: $x10
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gprtc = COPY $x10
-  ; CHECK-NEXT:   PseudoTAILIndirect [[COPY]], implicit $x2, cfi-type 12345678
+  ; CHECK-NEXT:   PseudoTAILIndirect [[COPY]], csr_ilp32_lp64, implicit $x2, cfi-type 12345678
   tail call void %x() [ "kcfi"(i32 12345678) ]
   ret void
 }
diff --git a/llvm/test/CodeGen/RISCV/kcfi-mir.ll b/llvm/test/CodeGen/RISCV/kcfi-mir.ll
index f9f383a..0c0d39a 100644
--- a/llvm/test/CodeGen/RISCV/kcfi-mir.ll
+++ b/llvm/test/CodeGen/RISCV/kcfi-mir.ll
@@ -30,7 +30,7 @@ define void @f2(ptr noundef %x) #0 {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   BUNDLE implicit-def $x6, implicit-def $x6_w, implicit-def $x6_h, implicit-def $x7, implicit-def $x7_w, implicit-def $x7_h, implicit-def $x28, implicit-def $x28_w, implicit-def $x28_h, implicit-def $x29, implicit-def $x29_w, implicit-def $x29_h, implicit-def $x30, implicit-def $x30_w, implicit-def $x30_h, implicit-def $x31, implicit-def $x31_w, implicit-def $x31_h, implicit killed $x10, implicit $x2 {
   ; CHECK-NEXT:     KCFI_CHECK $x10, 12345678, implicit-def $x6, implicit-def $x7, implicit-def $x28, implicit-def $x29, implicit-def $x30, implicit-def $x31
-  ; CHECK-NEXT:     PseudoTAILIndirect killed $x10, implicit $x2
+  ; CHECK-NEXT:     PseudoTAILIndirect killed $x10, csr_ilp32_lp64, implicit $x2
   ; CHECK-NEXT:   }
   tail call void %x() [ "kcfi"(i32 12345678) ]
   ret void
diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
index d529ae6..f9086ba9 100644
--- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
@@ -2449,82 +2449,72 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a2, 0(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a3, 0(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a3, a3, 16
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB24_2
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a0, 2(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a1, 2(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB24_2: # %res_block
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a2, 2(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lhu a0, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a3, 2(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lhu a1, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    slli a2, a2, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    slli a3, a3, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 48
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB24_2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a0, 2(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a1, 2(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB24_2: # %res_block
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a2, 2(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a3, 2(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a2, 0(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a3, 0(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a3, a3, 16
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB24_2
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lhu a2, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a0, 2(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lhu a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a1, 2(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB24_2: # %res_block
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    pack a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    pack a1, a3, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 48
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB24_2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a0, 2(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a1, 2(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB24_2: # %res_block
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a2, 2(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a3, 2(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    slli a2, a2, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    slli a3, a3, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_3:
@@ -2658,9 +2648,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a1, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_4:
@@ -2671,9 +2661,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_4:
@@ -2682,9 +2672,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a1, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_4:
@@ -2695,9 +2685,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_4:
@@ -2845,22 +2835,19 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_5:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB26_2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a1, 4(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB26_2: # %res_block
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a2, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a3, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_5:
@@ -2883,22 +2870,17 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_5:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB26_2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a1, 4(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB26_2: # %res_block
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a1, a3, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_5:
@@ -3052,28 +3034,19 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_6:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB27_3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a1, 4(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 48
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB27_3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2:
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a0, 0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB27_3: # %res_block
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a2, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a3, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_6:
@@ -3102,28 +3075,17 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_6:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB27_3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a1, 4(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 48
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB27_3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2:
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a0, 0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB27_3: # %res_block
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a1, a3, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_6:
@@ -3500,9 +3462,9 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_8:
@@ -3533,9 +3495,9 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_8:
diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll
index 860c3a9..f029029 100644
--- a/llvm/test/CodeGen/RISCV/memcmp.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp.ll
@@ -2710,6 +2710,216 @@ entry:
   ret i1 %ret
 }
 
+define i1 @bcmp_le_zero(ptr %s1, ptr %s2) nounwind {
+; CHECK-ALIGNED-RV32-LABEL: bcmp_le_zero:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-LABEL: bcmp_le_zero:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_le_zero:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_le_zero:
+; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_le_zero:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_le_zero:
+; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_le_zero:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-V-LABEL: bcmp_le_zero:
+; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-V-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-LABEL: bcmp_le_zero:
+; CHECK-UNALIGNED:       # %bb.0: # %entry
+; CHECK-UNALIGNED-NEXT:    lw a0, 0(a0)
+; CHECK-UNALIGNED-NEXT:    lw a1, 0(a1)
+; CHECK-UNALIGNED-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-NEXT:    slti a0, a0, 1
+; CHECK-UNALIGNED-NEXT:    ret
+entry:
+  %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 4)
+  %ret = icmp slt i32 %bcmp, 1
+  ret i1 %ret
+}
+
+define i1 @bcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
+; CHECK-ALIGNED-RV32-LABEL: bcmp_ge_zero:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-LABEL: bcmp_ge_zero:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_ge_zero:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_ge_zero:
+; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_ge_zero:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_ge_zero:
+; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_ge_zero:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-V-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-V-LABEL: bcmp_ge_zero:
+; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-V-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-V-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-LABEL: bcmp_ge_zero:
+; CHECK-UNALIGNED:       # %bb.0: # %entry
+; CHECK-UNALIGNED-NEXT:    li a0, 1
+; CHECK-UNALIGNED-NEXT:    ret
+entry:
+  %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 4)
+  %ret = icmp sgt i32 %bcmp, -1
+  ret i1 %ret
+}
+
 define i32 @memcmp_size_0(ptr %s1, ptr %s2) nounwind {
 ; CHECK-LABEL: memcmp_size_0:
 ; CHECK:       # %bb.0: # %entry
@@ -3145,82 +3355,72 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a2, 0(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a3, 0(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a3, a3, 16
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB24_2
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a0, 2(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a1, 2(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB24_2: # %res_block
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a2, 2(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lhu a0, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a3, 2(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lhu a1, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    slli a2, a2, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    slli a3, a3, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 48
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB24_2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a0, 2(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a1, 2(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB24_2: # %res_block
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a2, 2(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a3, 2(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a2, 0(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a3, 0(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a3, a3, 16
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB24_2
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lhu a2, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a0, 2(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lhu a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a1, 2(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB24_2: # %res_block
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    pack a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    pack a1, a3, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 48
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB24_2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a0, 2(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a1, 2(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB24_2: # %res_block
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a2, 2(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a3, 2(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    slli a2, a2, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    slli a3, a3, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_3:
@@ -3354,9 +3554,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a1, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_4:
@@ -3367,9 +3567,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_4:
@@ -3378,9 +3578,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a1, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_4:
@@ -3391,9 +3591,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_4:
@@ -3527,13 +3727,13 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB26_2
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB28_2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a0, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a1, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a0, a1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB26_2: # %res_block
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB28_2: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ori a0, a0, 1
@@ -3541,22 +3741,19 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_5:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB26_2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a1, 4(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB26_2: # %res_block
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a2, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a3, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_5:
@@ -3565,13 +3762,13 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB26_2
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB28_2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a0, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a1, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a0, a1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB26_2: # %res_block
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB28_2: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ori a0, a0, 1
@@ -3579,22 +3776,17 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_5:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB26_2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a1, 4(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB26_2: # %res_block
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a1, a3, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_5:
@@ -3728,7 +3920,7 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB27_3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB29_3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a0, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a1, 4(a1)
@@ -3736,11 +3928,11 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a3, a3, 16
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB27_3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB29_3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB27_3: # %res_block
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB29_3: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ori a0, a0, 1
@@ -3748,28 +3940,19 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_6:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB27_3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a1, 4(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 48
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB27_3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2:
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a0, 0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB27_3: # %res_block
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a2, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a3, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_6:
@@ -3778,7 +3961,7 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB27_3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB29_3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a0, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a1, 4(a1)
@@ -3786,11 +3969,11 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a3, a3, 16
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB27_3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB29_3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB27_3: # %res_block
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB29_3: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ori a0, a0, 1
@@ -3798,28 +3981,17 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_6:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB27_3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a1, 4(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 48
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB27_3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2:
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a0, 0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB27_3: # %res_block
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a1, a3, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_6:
@@ -3953,17 +4125,17 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB28_3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB30_3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 3(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 3(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB28_3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB30_3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB28_3: # %res_block
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB30_3: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ori a0, a0, 1
@@ -3977,7 +4149,7 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB28_3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB30_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 3(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 3(a1)
@@ -3985,11 +4157,11 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB28_3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB30_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB28_3: # %res_block
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB30_3: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
@@ -4001,17 +4173,17 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB28_3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB30_3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 3(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 3(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB28_3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB30_3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB28_3: # %res_block
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB30_3: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ori a0, a0, 1
@@ -4025,7 +4197,7 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB28_3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB30_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 3(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 3(a1)
@@ -4033,11 +4205,11 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB28_3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB30_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB28_3: # %res_block
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB30_3: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
@@ -4174,17 +4346,17 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB29_3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB31_3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB29_3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB31_3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB29_3: # %res_block
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB31_3: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ori a0, a0, 1
@@ -4196,9 +4368,9 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_8:
@@ -4207,17 +4379,17 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB29_3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB31_3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB29_3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB31_3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB29_3: # %res_block
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB31_3: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ori a0, a0, 1
@@ -4229,9 +4401,9 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_8:
@@ -4365,29 +4537,29 @@ define i32 @memcmp_size_15(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB30_5
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_5
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB30_5
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_5
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 8(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB30_5
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_5
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 11(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 11(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB30_5
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_5
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.4:
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB30_5: # %res_block
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB32_5: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ori a0, a0, 1
@@ -4399,17 +4571,17 @@ define i32 @memcmp_size_15(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB30_3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB32_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 7(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 7(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB30_3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB32_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB30_3: # %res_block
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB32_3: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
@@ -4421,29 +4593,29 @@ define i32 @memcmp_size_15(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB30_5
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_5
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB30_5
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_5
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 8(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB30_5
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_5
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 11(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 11(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB30_5
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_5
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.4:
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB30_5: # %res_block
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB32_5: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ori a0, a0, 1
@@ -4455,17 +4627,17 @@ define i32 @memcmp_size_15(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB30_3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB32_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 7(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 7(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB30_3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB32_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB30_3: # %res_block
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB32_3: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
@@ -4602,29 +4774,29 @@ define i32 @memcmp_size_16(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB31_5
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_5
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB31_5
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_5
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 8(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB31_5
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_5
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 12(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 12(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB31_5
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_5
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.4:
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB31_5: # %res_block
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB33_5: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ori a0, a0, 1
@@ -4636,17 +4808,17 @@ define i32 @memcmp_size_16(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB31_3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB33_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 8(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 8(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB31_3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB33_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB31_3: # %res_block
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB33_3: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
@@ -4658,29 +4830,29 @@ define i32 @memcmp_size_16(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB31_5
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_5
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB31_5
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_5
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 8(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB31_5
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_5
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 12(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 12(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB31_5
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_5
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.4:
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB31_5: # %res_block
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB33_5: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ori a0, a0, 1
@@ -4692,17 +4864,17 @@ define i32 @memcmp_size_16(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB31_3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB33_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 8(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 8(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB31_3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB33_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB31_3: # %res_block
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB33_3: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
@@ -4839,53 +5011,53 @@ define i32 @memcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 8(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 12(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 12(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.4: # %loadbb4
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 16(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 16(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.5: # %loadbb5
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 20(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 20(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.6: # %loadbb6
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 24(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 24(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.7: # %loadbb7
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 27(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 27(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.8:
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB32_9: # %res_block
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB34_9: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ori a0, a0, 1
@@ -4897,29 +5069,29 @@ define i32 @memcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB32_5
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_5
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 8(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 8(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB32_5
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_5
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 16(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB32_5
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_5
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 23(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 23(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB32_5
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_5
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.4:
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB32_5: # %res_block
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB34_5: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
@@ -4931,53 +5103,53 @@ define i32 @memcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 8(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 12(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 12(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.4: # %loadbb4
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 16(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 16(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.5: # %loadbb5
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 20(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 20(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.6: # %loadbb6
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 24(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 24(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.7: # %loadbb7
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 27(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 27(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.8:
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB32_9: # %res_block
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB34_9: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ori a0, a0, 1
@@ -4989,29 +5161,29 @@ define i32 @memcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB32_5
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_5
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 8(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 8(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB32_5
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_5
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 16(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB32_5
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_5
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 23(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 23(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB32_5
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_5
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.4:
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB32_5: # %res_block
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB34_5: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
@@ -5148,53 +5320,53 @@ define i32 @memcmp_size_32(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 8(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 12(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 12(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.4: # %loadbb4
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 16(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 16(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.5: # %loadbb5
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 20(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 20(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.6: # %loadbb6
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 24(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 24(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.7: # %loadbb7
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 28(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 28(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.8:
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB33_9: # %res_block
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB35_9: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ori a0, a0, 1
@@ -5206,29 +5378,29 @@ define i32 @memcmp_size_32(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB33_5
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_5
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 8(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 8(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB33_5
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_5
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 16(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB33_5
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_5
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 24(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 24(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB33_5
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_5
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.4:
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB33_5: # %res_block
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB35_5: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
@@ -5240,53 +5412,53 @@ define i32 @memcmp_size_32(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 8(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 12(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 12(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.4: # %loadbb4
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 16(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 16(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.5: # %loadbb5
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 20(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 20(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.6: # %loadbb6
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 24(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 24(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.7: # %loadbb7
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 28(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 28(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.8:
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB33_9: # %res_block
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB35_9: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ori a0, a0, 1
@@ -5298,29 +5470,29 @@ define i32 @memcmp_size_32(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB33_5
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_5
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 8(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 8(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB33_5
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_5
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 16(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB33_5
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_5
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 24(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 24(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB33_5
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_5
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.4:
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB33_5: # %res_block
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB35_5: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
@@ -5417,53 +5589,53 @@ define i32 @memcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 8(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 8(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 16(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 24(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 24(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.4: # %loadbb4
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 32(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 32(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.5: # %loadbb5
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 40(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 40(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.6: # %loadbb6
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 48(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 48(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.7: # %loadbb7
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 55(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 55(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.8:
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB34_9: # %res_block
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB36_9: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
@@ -5475,53 +5647,53 @@ define i32 @memcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 8(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 8(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 16(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 24(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 24(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.4: # %loadbb4
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 32(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 32(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.5: # %loadbb5
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 40(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 40(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.6: # %loadbb6
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 48(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 48(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.7: # %loadbb7
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 55(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 55(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.8:
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB34_9: # %res_block
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB36_9: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
@@ -5608,53 +5780,53 @@ define i32 @memcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 8(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 8(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 16(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 24(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 24(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.4: # %loadbb4
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 32(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 32(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.5: # %loadbb5
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 40(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 40(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.6: # %loadbb6
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 48(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 48(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.7: # %loadbb7
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 56(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 56(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.8:
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB35_9: # %res_block
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB37_9: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
@@ -5666,53 +5838,53 @@ define i32 @memcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 8(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 8(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 16(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 24(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 24(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.4: # %loadbb4
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 32(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 32(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.5: # %loadbb5
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 40(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 40(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.6: # %loadbb6
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 48(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 48(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.7: # %loadbb7
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 56(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 56(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.8:
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB35_9: # %res_block
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB37_9: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
@@ -6374,5 +6546,381 @@ entry:
   %ret = icmp sgt i32 %memcmp, 0
   ret i1 %ret
 }
+
+define i1 @memcmp_le_zero(ptr %s1, ptr %s2) nounwind {
+; CHECK-ALIGNED-RV32-LABEL: memcmp_le_zero:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-NEXT:    call memcmp
+; CHECK-ALIGNED-RV32-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-LABEL: memcmp_le_zero:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-NEXT:    call memcmp
+; CHECK-ALIGNED-RV64-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_le_zero:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call memcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_le_zero:
+; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call memcmp
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: memcmp_le_zero:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call memcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBKB-LABEL: memcmp_le_zero:
+; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call memcmp
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-V-LABEL: memcmp_le_zero:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-V-NEXT:    call memcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-V-LABEL: memcmp_le_zero:
+; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-V-NEXT:    call memcmp
+; CHECK-ALIGNED-RV64-V-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-LABEL: memcmp_le_zero:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 4
+; CHECK-UNALIGNED-RV32-NEXT:    call memcmp
+; CHECK-UNALIGNED-RV32-NEXT:    slti a0, a0, 1
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: memcmp_le_zero:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-NEXT:    li a2, 4
+; CHECK-UNALIGNED-RV64-NEXT:    call memcmp
+; CHECK-UNALIGNED-RV64-NEXT:    slti a0, a0, 1
+; CHECK-UNALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_le_zero:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_le_zero:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_le_zero:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_le_zero:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_le_zero:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 4
+; CHECK-UNALIGNED-RV32-V-NEXT:    call memcmp
+; CHECK-UNALIGNED-RV32-V-NEXT:    slti a0, a0, 1
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: memcmp_le_zero:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 4
+; CHECK-UNALIGNED-RV64-V-NEXT:    call memcmp
+; CHECK-UNALIGNED-RV64-V-NEXT:    slti a0, a0, 1
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
+entry:
+  %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iXLen 4)
+  %ret = icmp slt i32 %memcmp, 1
+  ret i1 %ret
+}
+
+define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
+; CHECK-ALIGNED-RV32-LABEL: memcmp_ge_zero:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-NEXT:    call memcmp
+; CHECK-ALIGNED-RV32-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-LABEL: memcmp_ge_zero:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-NEXT:    call memcmp
+; CHECK-ALIGNED-RV64-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_ge_zero:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call memcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_ge_zero:
+; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call memcmp
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: memcmp_ge_zero:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call memcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBKB-LABEL: memcmp_ge_zero:
+; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call memcmp
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-V-LABEL: memcmp_ge_zero:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-V-NEXT:    call memcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-V-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-V-LABEL: memcmp_ge_zero:
+; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-V-NEXT:    call memcmp
+; CHECK-ALIGNED-RV64-V-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-V-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-LABEL: memcmp_ge_zero:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 4
+; CHECK-UNALIGNED-RV32-NEXT:    call memcmp
+; CHECK-UNALIGNED-RV32-NEXT:    slti a0, a0, 0
+; CHECK-UNALIGNED-RV32-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: memcmp_ge_zero:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-NEXT:    li a2, 4
+; CHECK-UNALIGNED-RV64-NEXT:    call memcmp
+; CHECK-UNALIGNED-RV64-NEXT:    slti a0, a0, 0
+; CHECK-UNALIGNED-RV64-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_ge_zero:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_ge_zero:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_ge_zero:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_ge_zero:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_ge_zero:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 4
+; CHECK-UNALIGNED-RV32-V-NEXT:    call memcmp
+; CHECK-UNALIGNED-RV32-V-NEXT:    slti a0, a0, 0
+; CHECK-UNALIGNED-RV32-V-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: memcmp_ge_zero:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 4
+; CHECK-UNALIGNED-RV64-V-NEXT:    call memcmp
+; CHECK-UNALIGNED-RV64-V-NEXT:    slti a0, a0, 0
+; CHECK-UNALIGNED-RV64-V-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
+entry:
+  %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iXLen 4)
+  %ret = icmp sgt i32 %memcmp, -1
+  ret i1 %ret
+}
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CHECK-ALIGNED: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/neg-abs.ll b/llvm/test/CodeGen/RISCV/neg-abs.ll
index 7d6a6d7..fe19a4fa 100644
--- a/llvm/test/CodeGen/RISCV/neg-abs.ll
+++ b/llvm/test/CodeGen/RISCV/neg-abs.ll
@@ -258,3 +258,447 @@ define i64 @neg_abs64_multiuse(i64 %x, ptr %y) {
   %neg = sub nsw i64 0, %abs
   ret i64 %neg
 }
+
+define i32 @expanded_neg_abs32(i32 %x) {
+; RV32I-LABEL: expanded_neg_abs32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    blt a0, a1, .LBB6_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:  .LBB6_2:
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_abs32:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    min a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_abs32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a1, a0
+; RV64I-NEXT:    negw a0, a0
+; RV64I-NEXT:    blt a1, a0, .LBB6_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:  .LBB6_2:
+; RV64I-NEXT:    negw a0, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_abs32:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    sext.w a1, a0
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    max a0, a0, a1
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    ret
+  %n = sub i32 0, %x
+  %t = call i32 @llvm.smax.i32(i32 %n, i32 %x)
+  %r = sub i32 0, %t
+  ret i32 %r
+}
+
+define i32 @expanded_neg_abs32_unsigned(i32 %x) {
+; RV32I-LABEL: expanded_neg_abs32_unsigned:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    bltu a0, a1, .LBB7_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:  .LBB7_2:
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_abs32_unsigned:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    minu a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_abs32_unsigned:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a1, a0
+; RV64I-NEXT:    negw a0, a0
+; RV64I-NEXT:    bltu a1, a0, .LBB7_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:  .LBB7_2:
+; RV64I-NEXT:    negw a0, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_abs32_unsigned:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    sext.w a1, a0
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    maxu a0, a0, a1
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    ret
+  %n = sub i32 0, %x
+  %t = call i32 @llvm.umax.i32(i32 %n, i32 %x)
+  %r = sub i32 0, %t
+  ret i32 %r
+}
+
+define i64 @expanded_neg_abs64(i64 %x) {
+; RV32I-LABEL: expanded_neg_abs64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    snez a2, a0
+; RV32I-NEXT:    neg a3, a1
+; RV32I-NEXT:    sub a2, a3, a2
+; RV32I-NEXT:    neg a3, a0
+; RV32I-NEXT:    beq a2, a1, .LBB8_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slt a4, a1, a2
+; RV32I-NEXT:    beqz a4, .LBB8_3
+; RV32I-NEXT:    j .LBB8_4
+; RV32I-NEXT:  .LBB8_2:
+; RV32I-NEXT:    sltu a4, a0, a3
+; RV32I-NEXT:    bnez a4, .LBB8_4
+; RV32I-NEXT:  .LBB8_3:
+; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:    mv a3, a0
+; RV32I-NEXT:  .LBB8_4:
+; RV32I-NEXT:    snez a0, a3
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    neg a0, a3
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_abs64:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    snez a2, a0
+; RV32ZBB-NEXT:    neg a3, a1
+; RV32ZBB-NEXT:    sub a2, a3, a2
+; RV32ZBB-NEXT:    neg a3, a0
+; RV32ZBB-NEXT:    beq a2, a1, .LBB8_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    slt a4, a1, a2
+; RV32ZBB-NEXT:    beqz a4, .LBB8_3
+; RV32ZBB-NEXT:    j .LBB8_4
+; RV32ZBB-NEXT:  .LBB8_2:
+; RV32ZBB-NEXT:    sltu a4, a0, a3
+; RV32ZBB-NEXT:    bnez a4, .LBB8_4
+; RV32ZBB-NEXT:  .LBB8_3:
+; RV32ZBB-NEXT:    mv a2, a1
+; RV32ZBB-NEXT:    mv a3, a0
+; RV32ZBB-NEXT:  .LBB8_4:
+; RV32ZBB-NEXT:    snez a0, a3
+; RV32ZBB-NEXT:    add a0, a2, a0
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    neg a0, a3
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_abs64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    blt a0, a1, .LBB8_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB8_2:
+; RV64I-NEXT:    neg a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_abs64:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    neg a1, a0
+; RV64ZBB-NEXT:    min a0, a0, a1
+; RV64ZBB-NEXT:    ret
+  %n = sub i64 0, %x
+  %t = call i64 @llvm.smax.i64(i64 %n, i64 %x)
+  %r = sub i64 0, %t
+  ret i64 %r
+}
+
+define i64 @expanded_neg_abs64_unsigned(i64 %x) {
+; RV32I-LABEL: expanded_neg_abs64_unsigned:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    snez a2, a0
+; RV32I-NEXT:    neg a3, a1
+; RV32I-NEXT:    sub a2, a3, a2
+; RV32I-NEXT:    neg a3, a0
+; RV32I-NEXT:    beq a2, a1, .LBB9_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sltu a4, a1, a2
+; RV32I-NEXT:    beqz a4, .LBB9_3
+; RV32I-NEXT:    j .LBB9_4
+; RV32I-NEXT:  .LBB9_2:
+; RV32I-NEXT:    sltu a4, a0, a3
+; RV32I-NEXT:    bnez a4, .LBB9_4
+; RV32I-NEXT:  .LBB9_3:
+; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:    mv a3, a0
+; RV32I-NEXT:  .LBB9_4:
+; RV32I-NEXT:    snez a0, a3
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    neg a0, a3
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_abs64_unsigned:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    snez a2, a0
+; RV32ZBB-NEXT:    neg a3, a1
+; RV32ZBB-NEXT:    sub a2, a3, a2
+; RV32ZBB-NEXT:    neg a3, a0
+; RV32ZBB-NEXT:    beq a2, a1, .LBB9_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    sltu a4, a1, a2
+; RV32ZBB-NEXT:    beqz a4, .LBB9_3
+; RV32ZBB-NEXT:    j .LBB9_4
+; RV32ZBB-NEXT:  .LBB9_2:
+; RV32ZBB-NEXT:    sltu a4, a0, a3
+; RV32ZBB-NEXT:    bnez a4, .LBB9_4
+; RV32ZBB-NEXT:  .LBB9_3:
+; RV32ZBB-NEXT:    mv a2, a1
+; RV32ZBB-NEXT:    mv a3, a0
+; RV32ZBB-NEXT:  .LBB9_4:
+; RV32ZBB-NEXT:    snez a0, a3
+; RV32ZBB-NEXT:    add a0, a2, a0
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    neg a0, a3
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_abs64_unsigned:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    bltu a0, a1, .LBB9_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB9_2:
+; RV64I-NEXT:    neg a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_abs64_unsigned:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    neg a1, a0
+; RV64ZBB-NEXT:    minu a0, a0, a1
+; RV64ZBB-NEXT:    ret
+  %n = sub i64 0, %x
+  %t = call i64 @llvm.umax.i64(i64 %n, i64 %x)
+  %r = sub i64 0, %t
+  ret i64 %r
+}
+
+define i32 @expanded_neg_inv_abs32(i32 %x) {
+; RV32I-LABEL: expanded_neg_inv_abs32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    blt a1, a0, .LBB10_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:  .LBB10_2:
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_inv_abs32:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    max a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_inv_abs32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a1, a0
+; RV64I-NEXT:    negw a0, a0
+; RV64I-NEXT:    blt a0, a1, .LBB10_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:  .LBB10_2:
+; RV64I-NEXT:    negw a0, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_inv_abs32:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    sext.w a1, a0
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    min a0, a0, a1
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    ret
+  %n = sub i32 0, %x
+  %t = call i32 @llvm.smin.i32(i32 %n, i32 %x)
+  %r = sub i32 0, %t
+  ret i32 %r
+}
+
+define i32 @expanded_neg_inv_abs32_unsigned(i32 %x) {
+; RV32I-LABEL: expanded_neg_inv_abs32_unsigned:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    bltu a1, a0, .LBB11_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:  .LBB11_2:
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_inv_abs32_unsigned:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    maxu a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_inv_abs32_unsigned:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a1, a0
+; RV64I-NEXT:    negw a0, a0
+; RV64I-NEXT:    bltu a0, a1, .LBB11_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:  .LBB11_2:
+; RV64I-NEXT:    negw a0, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_inv_abs32_unsigned:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    sext.w a1, a0
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    minu a0, a0, a1
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    ret
+  %n = sub i32 0, %x
+  %t = call i32 @llvm.umin.i32(i32 %n, i32 %x)
+  %r = sub i32 0, %t
+  ret i32 %r
+}
+
+define i64 @expanded_neg_inv_abs64(i64 %x) {
+; RV32I-LABEL: expanded_neg_inv_abs64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    snez a2, a0
+; RV32I-NEXT:    neg a3, a1
+; RV32I-NEXT:    sub a2, a3, a2
+; RV32I-NEXT:    neg a3, a0
+; RV32I-NEXT:    beq a2, a1, .LBB12_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slt a4, a2, a1
+; RV32I-NEXT:    beqz a4, .LBB12_3
+; RV32I-NEXT:    j .LBB12_4
+; RV32I-NEXT:  .LBB12_2:
+; RV32I-NEXT:    sltu a4, a3, a0
+; RV32I-NEXT:    bnez a4, .LBB12_4
+; RV32I-NEXT:  .LBB12_3:
+; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:    mv a3, a0
+; RV32I-NEXT:  .LBB12_4:
+; RV32I-NEXT:    snez a0, a3
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    neg a0, a3
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_inv_abs64:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    snez a2, a0
+; RV32ZBB-NEXT:    neg a3, a1
+; RV32ZBB-NEXT:    sub a2, a3, a2
+; RV32ZBB-NEXT:    neg a3, a0
+; RV32ZBB-NEXT:    beq a2, a1, .LBB12_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    slt a4, a2, a1
+; RV32ZBB-NEXT:    beqz a4, .LBB12_3
+; RV32ZBB-NEXT:    j .LBB12_4
+; RV32ZBB-NEXT:  .LBB12_2:
+; RV32ZBB-NEXT:    sltu a4, a3, a0
+; RV32ZBB-NEXT:    bnez a4, .LBB12_4
+; RV32ZBB-NEXT:  .LBB12_3:
+; RV32ZBB-NEXT:    mv a2, a1
+; RV32ZBB-NEXT:    mv a3, a0
+; RV32ZBB-NEXT:  .LBB12_4:
+; RV32ZBB-NEXT:    snez a0, a3
+; RV32ZBB-NEXT:    add a0, a2, a0
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    neg a0, a3
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_inv_abs64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    blt a1, a0, .LBB12_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB12_2:
+; RV64I-NEXT:    neg a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_inv_abs64:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    neg a1, a0
+; RV64ZBB-NEXT:    max a0, a0, a1
+; RV64ZBB-NEXT:    ret
+  %n = sub i64 0, %x
+  %t = call i64 @llvm.smin.i64(i64 %n, i64 %x)
+  %r = sub i64 0, %t
+  ret i64 %r
+}
+
+define i64 @expanded_neg_inv_abs64_unsigned(i64 %x) {
+; RV32I-LABEL: expanded_neg_inv_abs64_unsigned:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    snez a2, a0
+; RV32I-NEXT:    neg a3, a1
+; RV32I-NEXT:    sub a2, a3, a2
+; RV32I-NEXT:    neg a3, a0
+; RV32I-NEXT:    beq a2, a1, .LBB13_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sltu a4, a2, a1
+; RV32I-NEXT:    beqz a4, .LBB13_3
+; RV32I-NEXT:    j .LBB13_4
+; RV32I-NEXT:  .LBB13_2:
+; RV32I-NEXT:    sltu a4, a3, a0
+; RV32I-NEXT:    bnez a4, .LBB13_4
+; RV32I-NEXT:  .LBB13_3:
+; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:    mv a3, a0
+; RV32I-NEXT:  .LBB13_4:
+; RV32I-NEXT:    snez a0, a3
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    neg a0, a3
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_inv_abs64_unsigned:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    snez a2, a0
+; RV32ZBB-NEXT:    neg a3, a1
+; RV32ZBB-NEXT:    sub a2, a3, a2
+; RV32ZBB-NEXT:    neg a3, a0
+; RV32ZBB-NEXT:    beq a2, a1, .LBB13_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    sltu a4, a2, a1
+; RV32ZBB-NEXT:    beqz a4, .LBB13_3
+; RV32ZBB-NEXT:    j .LBB13_4
+; RV32ZBB-NEXT:  .LBB13_2:
+; RV32ZBB-NEXT:    sltu a4, a3, a0
+; RV32ZBB-NEXT:    bnez a4, .LBB13_4
+; RV32ZBB-NEXT:  .LBB13_3:
+; RV32ZBB-NEXT:    mv a2, a1
+; RV32ZBB-NEXT:    mv a3, a0
+; RV32ZBB-NEXT:  .LBB13_4:
+; RV32ZBB-NEXT:    snez a0, a3
+; RV32ZBB-NEXT:    add a0, a2, a0
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    neg a0, a3
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_inv_abs64_unsigned:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    bltu a1, a0, .LBB13_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB13_2:
+; RV64I-NEXT:    neg a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_inv_abs64_unsigned:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    neg a1, a0
+; RV64ZBB-NEXT:    maxu a0, a0, a1
+; RV64ZBB-NEXT:    ret
+  %n = sub i64 0, %x
+  %t = call i64 @llvm.umin.i64(i64 %n, i64 %x)
+  %r = sub i64 0, %t
+  ret i64 %r
+}
diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadba.ll b/llvm/test/CodeGen/RISCV/rv32xtheadba.ll
index 332e497..44ab0e1 100644
--- a/llvm/test/CodeGen/RISCV/rv32xtheadba.ll
+++ b/llvm/test/CodeGen/RISCV/rv32xtheadba.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub
 ; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s -check-prefixes=RV32I
+; RUN:   | FileCheck %s -check-prefixes=CHECK,RV32I
 ; RUN: llc -mtriple=riscv32 -mattr=+m,+xtheadba -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s -check-prefixes=RV32XTHEADBA
+; RUN:   | FileCheck %s -check-prefixes=CHECK,RV32XTHEADBA
 
 define signext i16 @th_addsl_1(i64 %0, ptr %1) {
 ; RV32I-LABEL: th_addsl_1:
@@ -324,3 +324,563 @@ define i32 @mul288(i32 %a) {
   %c = mul i32 %a, 288
   ret i32 %c
 }
+
+define i32 @mul258(i32 %a) {
+; RV32I-LABEL: mul258:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 258
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul258:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    slli a1, a0, 8
+; RV32XTHEADBA-NEXT:    th.addsl a0, a1, a0, 1
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 258
+  ret i32 %c
+}
+
+define i32 @mul260(i32 %a) {
+; RV32I-LABEL: mul260:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 260
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul260:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    slli a1, a0, 8
+; RV32XTHEADBA-NEXT:    th.addsl a0, a1, a0, 2
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 260
+  ret i32 %c
+}
+
+define i32 @mul264(i32 %a) {
+; RV32I-LABEL: mul264:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 264
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul264:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    slli a1, a0, 8
+; RV32XTHEADBA-NEXT:    th.addsl a0, a1, a0, 3
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 264
+  ret i32 %c
+}
+
+define i32 @mul11(i32 %a) {
+; RV32I-LABEL: mul11:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 11
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul11:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a1, a0, a0, 2
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 1
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 11
+  ret i32 %c
+}
+
+define i32 @mul19(i32 %a) {
+; RV32I-LABEL: mul19:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 19
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul19:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a1, a0, a0, 3
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 1
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 19
+  ret i32 %c
+}
+
+define i32 @mul13(i32 %a) {
+; RV32I-LABEL: mul13:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 13
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul13:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a1, a0, a0, 1
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 13
+  ret i32 %c
+}
+
+define i32 @mul21(i32 %a) {
+; RV32I-LABEL: mul21:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 21
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul21:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a1, a0, a0, 2
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 21
+  ret i32 %c
+}
+
+define i32 @mul37(i32 %a) {
+; RV32I-LABEL: mul37:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 37
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul37:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a1, a0, a0, 3
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 37
+  ret i32 %c
+}
+
+define i32 @mul25(i32 %a) {
+; RV32I-LABEL: mul25:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 25
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul25:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 25
+  ret i32 %c
+}
+
+define i32 @mul41(i32 %a) {
+; RV32I-LABEL: mul41:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 41
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul41:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a1, a0, a0, 2
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 41
+  ret i32 %c
+}
+
+define i32 @mul73(i32 %a) {
+; RV32I-LABEL: mul73:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 73
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul73:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a1, a0, a0, 3
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 73
+  ret i32 %c
+}
+
+define i32 @mul27(i32 %a) {
+; RV32I-LABEL: mul27:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 27
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul27:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a0, 1
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 27
+  ret i32 %c
+}
+
+define i32 @mul45(i32 %a) {
+; RV32I-LABEL: mul45:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 45
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul45:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 45
+  ret i32 %c
+}
+
+define i32 @mul81(i32 %a) {
+; RV32I-LABEL: mul81:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 81
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul81:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 81
+  ret i32 %c
+}
+
+define i32 @mul4098(i32 %a) {
+; RV32I-LABEL: mul4098:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slli a1, a0, 1
+; RV32I-NEXT:    slli a0, a0, 12
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul4098:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    slli a1, a0, 12
+; RV32XTHEADBA-NEXT:    th.addsl a0, a1, a0, 1
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 4098
+  ret i32 %c
+}
+
+define i32 @mul4100(i32 %a) {
+; RV32I-LABEL: mul4100:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slli a1, a0, 2
+; RV32I-NEXT:    slli a0, a0, 12
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul4100:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    slli a1, a0, 12
+; RV32XTHEADBA-NEXT:    th.addsl a0, a1, a0, 2
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 4100
+  ret i32 %c
+}
+
+define i32 @mul4104(i32 %a) {
+; RV32I-LABEL: mul4104:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slli a1, a0, 3
+; RV32I-NEXT:    slli a0, a0, 12
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul4104:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    slli a1, a0, 12
+; RV32XTHEADBA-NEXT:    th.addsl a0, a1, a0, 3
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 4104
+  ret i32 %c
+}
+
+define i32 @add4104(i32 %a) {
+; RV32I-LABEL: add4104:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a1, 1
+; RV32I-NEXT:    addi a1, a1, 8
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: add4104:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    li a1, 1026
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV32XTHEADBA-NEXT:    ret
+  %c = add i32 %a, 4104
+  ret i32 %c
+}
+
+define i32 @add8208(i32 %a) {
+; RV32I-LABEL: add8208:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a1, 2
+; RV32I-NEXT:    addi a1, a1, 16
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: add8208:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    li a1, 1026
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV32XTHEADBA-NEXT:    ret
+  %c = add i32 %a, 8208
+  ret i32 %c
+}
+
+define i32 @add8192(i32 %a) {
+; CHECK-LABEL: add8192:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, 2
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = add i32 %a, 8192
+  ret i32 %c
+}
+
+define i32 @addshl_5_6(i32 %a, i32 %b) {
+; CHECK-LABEL: addshl_5_6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a1, a1, 6
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = shl i32 %a, 5
+  %d = shl i32 %b, 6
+  %e = add i32 %c, %d
+  ret i32 %e
+}
+
+define i32 @addshl_5_7(i32 %a, i32 %b) {
+; CHECK-LABEL: addshl_5_7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a1, a1, 7
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = shl i32 %a, 5
+  %d = shl i32 %b, 7
+  %e = add i32 %c, %d
+  ret i32 %e
+}
+
+define i32 @addshl_5_8(i32 %a, i32 %b) {
+; CHECK-LABEL: addshl_5_8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a1, a1, 8
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = shl i32 %a, 5
+  %d = shl i32 %b, 8
+  %e = add i32 %c, %d
+  ret i32 %e
+}
+
+define i32 @srli_1_sh2add(ptr %0, i32 %1) {
+; RV32I-LABEL: srli_1_sh2add:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    andi a1, a1, -4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: srli_1_sh2add:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    srli a1, a1, 1
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV32XTHEADBA-NEXT:    lw a0, 0(a0)
+; RV32XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 1
+  %4 = getelementptr inbounds i32, ptr %0, i32 %3
+  %5 = load i32, ptr %4, align 4
+  ret i32 %5
+}
+
+define i64 @srli_2_sh3add(ptr %0, i32 %1) {
+; RV32I-LABEL: srli_2_sh3add:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    andi a1, a1, -8
+; RV32I-NEXT:    add a1, a0, a1
+; RV32I-NEXT:    lw a0, 0(a1)
+; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: srli_2_sh3add:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    srli a1, a1, 2
+; RV32XTHEADBA-NEXT:    th.addsl a1, a0, a1, 3
+; RV32XTHEADBA-NEXT:    lw a0, 0(a1)
+; RV32XTHEADBA-NEXT:    lw a1, 4(a1)
+; RV32XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 2
+  %4 = getelementptr inbounds i64, ptr %0, i32 %3
+  %5 = load i64, ptr %4, align 8
+  ret i64 %5
+}
+
+define signext i16 @srli_2_sh1add(ptr %0, i32 %1) {
+; RV32I-LABEL: srli_2_sh1add:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    srli a1, a1, 1
+; RV32I-NEXT:    andi a1, a1, -2
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    lh a0, 0(a0)
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: srli_2_sh1add:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    srli a1, a1, 2
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 1
+; RV32XTHEADBA-NEXT:    lh a0, 0(a0)
+; RV32XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 2
+  %4 = getelementptr inbounds i16, ptr %0, i32 %3
+  %5 = load i16, ptr %4, align 2
+  ret i16 %5
+}
+
+define i32 @srli_3_sh2add(ptr %0, i32 %1) {
+; RV32I-LABEL: srli_3_sh2add:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    srli a1, a1, 1
+; RV32I-NEXT:    andi a1, a1, -4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: srli_3_sh2add:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    srli a1, a1, 3
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV32XTHEADBA-NEXT:    lw a0, 0(a0)
+; RV32XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 3
+  %4 = getelementptr inbounds i32, ptr %0, i32 %3
+  %5 = load i32, ptr %4, align 4
+  ret i32 %5
+}
+
+define i64 @srli_4_sh3add(ptr %0, i32 %1) {
+; RV32I-LABEL: srli_4_sh3add:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    srli a1, a1, 1
+; RV32I-NEXT:    andi a1, a1, -8
+; RV32I-NEXT:    add a1, a0, a1
+; RV32I-NEXT:    lw a0, 0(a1)
+; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: srli_4_sh3add:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    srli a1, a1, 4
+; RV32XTHEADBA-NEXT:    th.addsl a1, a0, a1, 3
+; RV32XTHEADBA-NEXT:    lw a0, 0(a1)
+; RV32XTHEADBA-NEXT:    lw a1, 4(a1)
+; RV32XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 4
+  %4 = getelementptr inbounds i64, ptr %0, i32 %3
+  %5 = load i64, ptr %4, align 8
+  ret i64 %5
+}
+
+define i32 @mul_neg1(i32 %a) {
+; CHECK-LABEL: mul_neg1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    ret
+  %c = mul i32 %a, -1
+  ret i32 %c
+}
+
+define i32 @mul_neg2(i32 %a) {
+; CHECK-LABEL: mul_neg2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    ret
+  %c = mul i32 %a, -2
+  ret i32 %c
+}
+
+define i32 @mul_neg3(i32 %a) {
+; RV32I-LABEL: mul_neg3:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slli a1, a0, 1
+; RV32I-NEXT:    neg a0, a0
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul_neg3:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a0, 1
+; RV32XTHEADBA-NEXT:    neg a0, a0
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, -3
+  ret i32 %c
+}
+
+define i32 @mul_neg4(i32 %a) {
+; CHECK-LABEL: mul_neg4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    ret
+  %c = mul i32 %a, -4
+  ret i32 %c
+}
+
+define i32 @mul_neg5(i32 %a) {
+; RV32I-LABEL: mul_neg5:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slli a1, a0, 2
+; RV32I-NEXT:    neg a0, a0
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul_neg5:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV32XTHEADBA-NEXT:    neg a0, a0
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, -5
+  ret i32 %c
+}
+
+define i32 @mul_neg6(i32 %a) {
+; CHECK-LABEL: mul_neg6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -6
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = mul i32 %a, -6
+  ret i32 %c
+}
+
+define i32 @mul_neg7(i32 %a) {
+; CHECK-LABEL: mul_neg7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    sub a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = mul i32 %a, -7
+  ret i32 %c
+}
+
+define i32 @mul_neg8(i32 %a) {
+; CHECK-LABEL: mul_neg8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    ret
+  %c = mul i32 %a, -8
+  ret i32 %c
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
index 2d44ffb..2272c17b 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub
 ; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s -check-prefixes=RV64I
+; RUN:   | FileCheck %s -check-prefixes=CHECK,RV64I
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+xtheadba -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s -check-prefixes=RV64XTHEADBA
+; RUN:   | FileCheck %s -check-prefixes=CHECK,RV64XTHEADBA
 
 define signext i16 @th_addsl_1(i64 %0, ptr %1) {
 ; RV64I-LABEL: th_addsl_1:
@@ -109,6 +109,25 @@ define i64 @addmul6(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @disjointormul6(i64 %a, i64 %b) {
+; RV64I-LABEL: disjointormul6:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a2, a0, 1
+; RV64I-NEXT:    slli a0, a0, 3
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: disjointormul6:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 6
+  %d = or disjoint i64 %c, %b
+  ret i64 %d
+}
+
 define i64 @addmul10(i64 %a, i64 %b) {
 ; RV64I-LABEL: addmul10:
 ; RV64I:       # %bb.0:
@@ -182,6 +201,18 @@ define i64 @addmul20(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @addmul22(i64 %a, i64 %b) {
+; CHECK-LABEL: addmul22:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a2, 22
+; CHECK-NEXT:    mul a0, a0, a2
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = mul i64 %a, 22
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
 define i64 @addmul24(i64 %a, i64 %b) {
 ; RV64I-LABEL: addmul24:
 ; RV64I:       # %bb.0:
@@ -255,6 +286,461 @@ define i64 @addmul72(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @mul50(i64 %a) {
+; RV64I-LABEL: mul50:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 50
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul50:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    slli a0, a0, 1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 50
+  ret i64 %c
+}
+
+define i64 @addmul50(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul50:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a2, 50
+; RV64I-NEXT:    mul a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: addmul50:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 50
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @mul100(i64 %a) {
+; RV64I-LABEL: mul100:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 100
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul100:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    slli a0, a0, 2
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 100
+  ret i64 %c
+}
+
+define i64 @addmul100(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul100:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a2, 100
+; RV64I-NEXT:    mul a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: addmul100:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 2
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 100
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @mul162(i64 %a) {
+; RV64I-LABEL: mul162:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 162
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul162:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV64XTHEADBA-NEXT:    slli a0, a0, 1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 162
+  ret i64 %c
+}
+
+define i64 @addmul162(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul162:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a2, 162
+; RV64I-NEXT:    mul a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: addmul162:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 162
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @mul180(i64 %a) {
+; RV64I-LABEL: mul180:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 180
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul180:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV64XTHEADBA-NEXT:    slli a0, a0, 2
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 180
+  ret i64 %c
+}
+
+define i64 @addmul180(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul180:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a2, 180
+; RV64I-NEXT:    mul a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: addmul180:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 2
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 180
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @add255mul180(i64 %a) {
+; RV64I-LABEL: add255mul180:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 180
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    addi a0, a0, 255
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: add255mul180:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV64XTHEADBA-NEXT:    slli a0, a0, 2
+; RV64XTHEADBA-NEXT:    addi a0, a0, 255
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 180
+  %d = add i64 %c, 255
+  ret i64 %d
+}
+
+define i64 @mul200(i64 %a) {
+; RV64I-LABEL: mul200:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 200
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul200:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    slli a0, a0, 3
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 200
+  ret i64 %c
+}
+
+define i64 @addmul200(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul200:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a2, 200
+; RV64I-NEXT:    mul a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: addmul200:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 3
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 200
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @addmul4096(i64 %a, i64 %b) {
+; CHECK-LABEL: addmul4096:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 12
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = mul i64 %a, 4096
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @addmul4230(i64 %a, i64 %b) {
+; CHECK-LABEL: addmul4230:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a2, 1
+; CHECK-NEXT:    addiw a2, a2, 134
+; CHECK-NEXT:    mul a0, a0, a2
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = mul i64 %a, 4230
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @mul96(i64 %a) {
+; RV64I-LABEL: mul96:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a0, 5
+; RV64I-NEXT:    slli a0, a0, 7
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul96:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 1
+; RV64XTHEADBA-NEXT:    slli a0, a0, 5
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 96
+  ret i64 %c
+}
+
+define i64 @mul119(i64 %a) {
+; RV64I-LABEL: mul119:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 119
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul119:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a1, a0, a0, 3
+; RV64XTHEADBA-NEXT:    slli a0, a0, 7
+; RV64XTHEADBA-NEXT:    sub a0, a0, a1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 119
+  ret i64 %c
+}
+
+define i64 @mul123(i64 %a) {
+; RV64I-LABEL: mul123:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 123
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul123:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a1, a0, a0, 2
+; RV64XTHEADBA-NEXT:    slli a0, a0, 7
+; RV64XTHEADBA-NEXT:    sub a0, a0, a1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 123
+  ret i64 %c
+}
+
+define i64 @mul125(i64 %a) {
+; RV64I-LABEL: mul125:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 125
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul125:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a1, a0, a0, 1
+; RV64XTHEADBA-NEXT:    slli a0, a0, 7
+; RV64XTHEADBA-NEXT:    sub a0, a0, a1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 125
+  ret i64 %c
+}
+
+define i64 @mul131(i64 %a) {
+; RV64I-LABEL: mul131:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 131
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul131:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a1, a0, a0, 1
+; RV64XTHEADBA-NEXT:    slli a0, a0, 7
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 131
+  ret i64 %c
+}
+
+define i64 @mul133(i64 %a) {
+; RV64I-LABEL: mul133:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 133
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul133:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a1, a0, a0, 2
+; RV64XTHEADBA-NEXT:    slli a0, a0, 7
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 133
+  ret i64 %c
+}
+
+define i64 @mul137(i64 %a) {
+; RV64I-LABEL: mul137:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 137
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul137:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a1, a0, a0, 3
+; RV64XTHEADBA-NEXT:    slli a0, a0, 7
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 137
+  ret i64 %c
+}
+
+define i64 @mul160(i64 %a) {
+; RV64I-LABEL: mul160:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 160
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul160:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    slli a0, a0, 5
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 160
+  ret i64 %c
+}
+
+define i64 @mul288(i64 %a) {
+; RV64I-LABEL: mul288:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 288
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul288:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV64XTHEADBA-NEXT:    slli a0, a0, 5
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 288
+  ret i64 %c
+}
+
+define i64 @sh1add_imm(i64 %0) {
+; CHECK-LABEL: sh1add_imm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    addi a0, a0, 5
+; CHECK-NEXT:    ret
+  %a = shl i64 %0, 1
+  %b = add i64 %a, 5
+  ret i64 %b
+}
+
+define i64 @sh2add_imm(i64 %0) {
+; CHECK-LABEL: sh2add_imm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    addi a0, a0, -6
+; CHECK-NEXT:    ret
+  %a = shl i64 %0, 2
+  %b = add i64 %a, -6
+  ret i64 %b
+}
+
+define i64 @sh3add_imm(i64 %0) {
+; CHECK-LABEL: sh3add_imm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    addi a0, a0, 7
+; CHECK-NEXT:    ret
+  %a = shl i64 %0, 3
+  %b = add i64 %a, 7
+  ret i64 %b
+}
+
+define i64 @mul258(i64 %a) {
+; RV64I-LABEL: mul258:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 258
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul258:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a0, 8
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 258
+  ret i64 %c
+}
+
+define i64 @mul260(i64 %a) {
+; RV64I-LABEL: mul260:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 260
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul260:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a0, 8
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 2
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 260
+  ret i64 %c
+}
+
+define i64 @mul264(i64 %a) {
+; RV64I-LABEL: mul264:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 264
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul264:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a0, 8
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 3
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 264
+  ret i64 %c
+}
 define i64 @mul11(i64 %a) {
 ; RV64I-LABEL: mul11:
 ; RV64I:       # %bb.0:
@@ -431,86 +917,1148 @@ define i64 @mul81(i64 %a) {
   ret i64 %c
 }
 
+define i64 @mul4098(i64 %a) {
+; RV64I-LABEL: mul4098:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a0, 1
+; RV64I-NEXT:    slli a0, a0, 12
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul4098:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a0, 12
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 4098
+  ret i64 %c
+}
 
-define i64 @mul96(i64 %a) {
-; RV64I-LABEL: mul96:
+define i64 @mul4100(i64 %a) {
+; RV64I-LABEL: mul4100:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    slli a1, a0, 5
-; RV64I-NEXT:    slli a0, a0, 7
-; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    slli a1, a0, 2
+; RV64I-NEXT:    slli a0, a0, 12
+; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
-; RV64XTHEADBA-LABEL: mul96:
+; RV64XTHEADBA-LABEL: mul4100:
 ; RV64XTHEADBA:       # %bb.0:
-; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 1
-; RV64XTHEADBA-NEXT:    slli a0, a0, 5
+; RV64XTHEADBA-NEXT:    slli a1, a0, 12
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 2
 ; RV64XTHEADBA-NEXT:    ret
-  %c = mul i64 %a, 96
+  %c = mul i64 %a, 4100
   ret i64 %c
 }
 
-define i64 @mul137(i64 %a) {
-; RV64I-LABEL: mul137:
+define i64 @mul4104(i64 %a) {
+; RV64I-LABEL: mul4104:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    li a1, 137
-; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    slli a1, a0, 3
+; RV64I-NEXT:    slli a0, a0, 12
+; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
-; RV64XTHEADBA-LABEL: mul137:
+; RV64XTHEADBA-LABEL: mul4104:
 ; RV64XTHEADBA:       # %bb.0:
-; RV64XTHEADBA-NEXT:    th.addsl a1, a0, a0, 3
-; RV64XTHEADBA-NEXT:    slli a0, a0, 7
-; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    slli a1, a0, 12
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 3
 ; RV64XTHEADBA-NEXT:    ret
-  %c = mul i64 %a, 137
+  %c = mul i64 %a, 4104
   ret i64 %c
 }
 
-define i64 @mul160(i64 %a) {
-; RV64I-LABEL: mul160:
+define signext i32 @mulw192(i32 signext %a) {
+; RV64I-LABEL: mulw192:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    li a1, 160
-; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    slli a1, a0, 6
+; RV64I-NEXT:    slli a0, a0, 8
+; RV64I-NEXT:    subw a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
-; RV64XTHEADBA-LABEL: mul160:
+; RV64XTHEADBA-LABEL: mulw192:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 1
+; RV64XTHEADBA-NEXT:    slliw a0, a0, 6
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 192
+  ret i32 %c
+}
+
+define signext i32 @mulw320(i32 signext %a) {
+; RV64I-LABEL: mulw320:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 320
+; RV64I-NEXT:    mulw a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mulw320:
 ; RV64XTHEADBA:       # %bb.0:
 ; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
-; RV64XTHEADBA-NEXT:    slli a0, a0, 5
+; RV64XTHEADBA-NEXT:    slliw a0, a0, 6
 ; RV64XTHEADBA-NEXT:    ret
-  %c = mul i64 %a, 160
+  %c = mul i32 %a, 320
+  ret i32 %c
+}
+
+define signext i32 @mulw576(i32 signext %a) {
+; RV64I-LABEL: mulw576:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 576
+; RV64I-NEXT:    mulw a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mulw576:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV64XTHEADBA-NEXT:    slliw a0, a0, 6
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 576
+  ret i32 %c
+}
+
+define i64 @add4104(i64 %a) {
+; RV64I-LABEL: add4104:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a1, 1
+; RV64I-NEXT:    addiw a1, a1, 8
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: add4104:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    li a1, 1026
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV64XTHEADBA-NEXT:    ret
+  %c = add i64 %a, 4104
   ret i64 %c
 }
 
-define i64 @mul200(i64 %a) {
-; RV64I-LABEL: mul200:
+define i64 @add4104_2(i64 %a) {
+; RV64I-LABEL: add4104_2:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    li a1, 200
-; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    lui a1, 1
+; RV64I-NEXT:    addiw a1, a1, 8
+; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
-; RV64XTHEADBA-LABEL: mul200:
+; RV64XTHEADBA-LABEL: add4104_2:
 ; RV64XTHEADBA:       # %bb.0:
-; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
-; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
-; RV64XTHEADBA-NEXT:    slli a0, a0, 3
+; RV64XTHEADBA-NEXT:    li a1, 1026
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
 ; RV64XTHEADBA-NEXT:    ret
-  %c = mul i64 %a, 200
+  %c = or disjoint i64 %a, 4104
   ret i64 %c
 }
 
-define i64 @mul288(i64 %a) {
-; RV64I-LABEL: mul288:
+define i64 @add8208(i64 %a) {
+; RV64I-LABEL: add8208:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    li a1, 288
-; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    lui a1, 2
+; RV64I-NEXT:    addiw a1, a1, 16
+; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
-; RV64XTHEADBA-LABEL: mul288:
+; RV64XTHEADBA-LABEL: add8208:
 ; RV64XTHEADBA:       # %bb.0:
-; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
-; RV64XTHEADBA-NEXT:    slli a0, a0, 5
+; RV64XTHEADBA-NEXT:    li a1, 1026
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
 ; RV64XTHEADBA-NEXT:    ret
-  %c = mul i64 %a, 288
+  %c = add i64 %a, 8208
+  ret i64 %c
+}
+
+; Make sure we prefer LUI for the 8192 instead of using sh3add.
+define signext i32 @add8192_i32(i32 signext %a) {
+; CHECK-LABEL: add8192_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, 2
+; CHECK-NEXT:    addw a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = add i32 %a, 8192
+  ret i32 %c
+}
+
+; Make sure we prefer LUI for the 8192 instead of using sh3add.
+define i64 @add8192(i64 %a) {
+; CHECK-LABEL: add8192:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, 2
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = add i64 %a, 8192
+  ret i64 %c
+}
+
+define signext i32 @addshl32_5_6(i32 signext %a, i32 signext %b) {
+; CHECK-LABEL: addshl32_5_6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a1, a1, 6
+; CHECK-NEXT:    addw a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = shl i32 %a, 5
+  %d = shl i32 %b, 6
+  %e = add i32 %c, %d
+  ret i32 %e
+}
+
+define i64 @addshl64_5_6(i64 %a, i64 %b) {
+; CHECK-LABEL: addshl64_5_6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a1, a1, 6
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = shl i64 %a, 5
+  %d = shl i64 %b, 6
+  %e = add i64 %c, %d
+  ret i64 %e
+}
+
+define signext i32 @addshl32_5_7(i32 signext %a, i32 signext %b) {
+; CHECK-LABEL: addshl32_5_7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a1, a1, 7
+; CHECK-NEXT:    addw a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = shl i32 %a, 5
+  %d = shl i32 %b, 7
+  %e = add i32 %c, %d
+  ret i32 %e
+}
+
+define i64 @addshl64_5_7(i64 %a, i64 %b) {
+; CHECK-LABEL: addshl64_5_7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a1, a1, 7
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = shl i64 %a, 5
+  %d = shl i64 %b, 7
+  %e = add i64 %c, %d
+  ret i64 %e
+}
+
+define signext i32 @addshl32_5_8(i32 signext %a, i32 signext %b) {
+; CHECK-LABEL: addshl32_5_8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a1, a1, 8
+; CHECK-NEXT:    addw a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = shl i32 %a, 5
+  %d = shl i32 %b, 8
+  %e = add i32 %c, %d
+  ret i32 %e
+}
+
+define i64 @addshl64_5_8(i64 %a, i64 %b) {
+; CHECK-LABEL: addshl64_5_8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a1, a1, 8
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = shl i64 %a, 5
+  %d = shl i64 %b, 8
+  %e = add i64 %c, %d
+  ret i64 %e
+}
+
+define i64 @sh6_sh3_add1(i64 noundef %x, i64 noundef %y, i64 noundef %z) {
+; RV64I-LABEL: sh6_sh3_add1:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    slli a1, a1, 6
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: sh6_sh3_add1:
+; RV64XTHEADBA:       # %bb.0: # %entry
+; RV64XTHEADBA-NEXT:    slli a1, a1, 6
+; RV64XTHEADBA-NEXT:    th.addsl a1, a1, a2, 3
+; RV64XTHEADBA-NEXT:    add a0, a1, a0
+; RV64XTHEADBA-NEXT:    ret
+entry:
+  %shl = shl i64 %z, 3
+  %shl1 = shl i64 %y, 6
+  %add = add nsw i64 %shl1, %shl
+  %add2 = add nsw i64 %add, %x
+  ret i64 %add2
+}
+
+define i64 @sh6_sh3_add2(i64 noundef %x, i64 noundef %y, i64 noundef %z) {
+; RV64I-LABEL: sh6_sh3_add2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    slli a1, a1, 6
+; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: sh6_sh3_add2:
+; RV64XTHEADBA:       # %bb.0: # %entry
+; RV64XTHEADBA-NEXT:    slli a1, a1, 6
+; RV64XTHEADBA-NEXT:    add a0, a1, a0
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 3
+; RV64XTHEADBA-NEXT:    ret
+entry:
+  %shl = shl i64 %z, 3
+  %shl1 = shl i64 %y, 6
+  %add = add nsw i64 %shl1, %x
+  %add2 = add nsw i64 %add, %shl
+  ret i64 %add2
+}
+
+define i64 @sh6_sh3_add3(i64 noundef %x, i64 noundef %y, i64 noundef %z) {
+; RV64I-LABEL: sh6_sh3_add3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    slli a1, a1, 6
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: sh6_sh3_add3:
+; RV64XTHEADBA:       # %bb.0: # %entry
+; RV64XTHEADBA-NEXT:    slli a1, a1, 6
+; RV64XTHEADBA-NEXT:    th.addsl a1, a1, a2, 3
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    ret
+entry:
+  %shl = shl i64 %z, 3
+  %shl1 = shl i64 %y, 6
+  %add = add nsw i64 %shl1, %shl
+  %add2 = add nsw i64 %x, %add
+  ret i64 %add2
+}
+
+define i64 @sh6_sh3_add4(i64 noundef %x, i64 noundef %y, i64 noundef %z) {
+; RV64I-LABEL: sh6_sh3_add4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    slli a1, a1, 6
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: sh6_sh3_add4:
+; RV64XTHEADBA:       # %bb.0: # %entry
+; RV64XTHEADBA-NEXT:    slli a1, a1, 6
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 3
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    ret
+entry:
+  %shl = shl i64 %z, 3
+  %shl1 = shl i64 %y, 6
+  %add = add nsw i64 %x, %shl
+  %add2 = add nsw i64 %add, %shl1
+  ret i64 %add2
+}
+
+define signext i16 @srliw_1_sh1add(ptr %0, i32 signext %1) {
+; CHECK-LABEL: srliw_1_sh1add:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srliw a1, a1, 1
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    lh a0, 0(a0)
+; CHECK-NEXT:    ret
+  %3 = lshr i32 %1, 1
+  %4 = zext i32 %3 to i64
+  %5 = getelementptr inbounds i16, ptr %0, i64 %4
+  %6 = load i16, ptr %5, align 2
+  ret i16 %6
+}
+
+define signext i32 @srliw_2_sh2add(ptr %0, i32 signext %1) {
+; CHECK-LABEL: srliw_2_sh2add:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srliw a1, a1, 2
+; CHECK-NEXT:    slli a1, a1, 2
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    lw a0, 0(a0)
+; CHECK-NEXT:    ret
+  %3 = lshr i32 %1, 2
+  %4 = zext i32 %3 to i64
+  %5 = getelementptr inbounds i32, ptr %0, i64 %4
+  %6 = load i32, ptr %5, align 4
+  ret i32 %6
+}
+
+define i64 @srliw_3_sh3add(ptr %0, i32 signext %1) {
+; CHECK-LABEL: srliw_3_sh3add:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srliw a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ld a0, 0(a0)
+; CHECK-NEXT:    ret
+  %3 = lshr i32 %1, 3
+  %4 = zext i32 %3 to i64
+  %5 = getelementptr inbounds i64, ptr %0, i64 %4
+  %6 = load i64, ptr %5, align 8
+  ret i64 %6
+}
+
+define signext i32 @srliw_1_sh2add(ptr %0, i32 signext %1) {
+; RV64I-LABEL: srliw_1_sh2add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srliw a1, a1, 1
+; RV64I-NEXT:    slli a1, a1, 2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    lw a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srliw_1_sh2add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srliw a1, a1, 1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV64XTHEADBA-NEXT:    lw a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 1
+  %4 = zext i32 %3 to i64
+  %5 = getelementptr inbounds i32, ptr %0, i64 %4
+  %6 = load i32, ptr %5, align 4
+  ret i32 %6
+}
+
+define i64 @srliw_1_sh3add(ptr %0, i32 signext %1) {
+; RV64I-LABEL: srliw_1_sh3add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srliw a1, a1, 1
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srliw_1_sh3add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srliw a1, a1, 1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV64XTHEADBA-NEXT:    ld a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 1
+  %4 = zext i32 %3 to i64
+  %5 = getelementptr inbounds i64, ptr %0, i64 %4
+  %6 = load i64, ptr %5, align 8
+  ret i64 %6
+}
+
+define i64 @srliw_2_sh3add(ptr %0, i32 signext %1) {
+; RV64I-LABEL: srliw_2_sh3add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srliw a1, a1, 2
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srliw_2_sh3add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srliw a1, a1, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV64XTHEADBA-NEXT:    ld a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 2
+  %4 = zext i32 %3 to i64
+  %5 = getelementptr inbounds i64, ptr %0, i64 %4
+  %6 = load i64, ptr %5, align 8
+  ret i64 %6
+}
+
+define signext i16 @srliw_2_sh1add(ptr %0, i32 signext %1) {
+; RV64I-LABEL: srliw_2_sh1add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srliw a1, a1, 2
+; RV64I-NEXT:    slli a1, a1, 1
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    lh a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srliw_2_sh1add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srliw a1, a1, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 1
+; RV64XTHEADBA-NEXT:    lh a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 2
+  %4 = zext i32 %3 to i64
+  %5 = getelementptr inbounds i16, ptr %0, i64 %4
+  %6 = load i16, ptr %5, align 2
+  ret i16 %6
+}
+
+
+define signext i32 @srliw_3_sh2add(ptr %0, i32 signext %1) {
+; RV64I-LABEL: srliw_3_sh2add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srliw a1, a1, 3
+; RV64I-NEXT:    slli a1, a1, 2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    lw a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srliw_3_sh2add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srliw a1, a1, 3
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV64XTHEADBA-NEXT:    lw a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 3
+  %4 = zext i32 %3 to i64
+  %5 = getelementptr inbounds i32, ptr %0, i64 %4
+  %6 = load i32, ptr %5, align 4
+  ret i32 %6
+}
+
+define i64 @srliw_4_sh3add(ptr %0, i32 signext %1) {
+; RV64I-LABEL: srliw_4_sh3add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srliw a1, a1, 4
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srliw_4_sh3add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srliw a1, a1, 4
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV64XTHEADBA-NEXT:    ld a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 4
+  %4 = zext i32 %3 to i64
+  %5 = getelementptr inbounds i64, ptr %0, i64 %4
+  %6 = load i64, ptr %5, align 8
+  ret i64 %6
+}
+
+define signext i32 @srli_1_sh2add(ptr %0, i64 %1) {
+; RV64I-LABEL: srli_1_sh2add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 1
+; RV64I-NEXT:    andi a1, a1, -4
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    lw a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srli_1_sh2add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srli a1, a1, 1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV64XTHEADBA-NEXT:    lw a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i64 %1, 1
+  %4 = getelementptr inbounds i32, ptr %0, i64 %3
+  %5 = load i32, ptr %4, align 4
+  ret i32 %5
+}
+
+define i64 @srli_2_sh3add(ptr %0, i64 %1) {
+; RV64I-LABEL: srli_2_sh3add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 1
+; RV64I-NEXT:    andi a1, a1, -8
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srli_2_sh3add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srli a1, a1, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV64XTHEADBA-NEXT:    ld a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i64 %1, 2
+  %4 = getelementptr inbounds i64, ptr %0, i64 %3
+  %5 = load i64, ptr %4, align 8
+  ret i64 %5
+}
+
+define signext i16 @srli_2_sh1add(ptr %0, i64 %1) {
+; RV64I-LABEL: srli_2_sh1add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srli a1, a1, 1
+; RV64I-NEXT:    andi a1, a1, -2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    lh a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srli_2_sh1add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srli a1, a1, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 1
+; RV64XTHEADBA-NEXT:    lh a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i64 %1, 2
+  %4 = getelementptr inbounds i16, ptr %0, i64 %3
+  %5 = load i16, ptr %4, align 2
+  ret i16 %5
+}
+
+define signext i32 @srli_3_sh2add(ptr %0, i64 %1) {
+; RV64I-LABEL: srli_3_sh2add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srli a1, a1, 1
+; RV64I-NEXT:    andi a1, a1, -4
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    lw a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srli_3_sh2add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srli a1, a1, 3
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV64XTHEADBA-NEXT:    lw a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i64 %1, 3
+  %4 = getelementptr inbounds i32, ptr %0, i64 %3
+  %5 = load i32, ptr %4, align 4
+  ret i32 %5
+}
+
+define i64 @srli_4_sh3add(ptr %0, i64 %1) {
+; RV64I-LABEL: srli_4_sh3add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srli a1, a1, 1
+; RV64I-NEXT:    andi a1, a1, -8
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srli_4_sh3add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srli a1, a1, 4
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV64XTHEADBA-NEXT:    ld a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i64 %1, 4
+  %4 = getelementptr inbounds i64, ptr %0, i64 %3
+  %5 = load i64, ptr %4, align 8
+  ret i64 %5
+}
+
+define i8 @array_index_sh1_sh0(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh1_sh0:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 1
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh1_sh0:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 1
+; RV64XTHEADBA-NEXT:    add a0, a0, a2
+; RV64XTHEADBA-NEXT:    lbu a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [2 x i8], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i8, ptr %a, align 1
+  ret i8 %b
+}
+
+define i16 @array_index_sh1_sh1(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh1_sh1:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 1
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    lh a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh1_sh1:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 1
+; RV64XTHEADBA-NEXT:    lh a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [2 x i16], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i16, ptr %a, align 2
+  ret i16 %b
+}
+
+define i32 @array_index_sh1_sh2(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh1_sh2:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 2
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    lw a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh1_sh2:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 2
+; RV64XTHEADBA-NEXT:    lw a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [2 x i32], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i32, ptr %a, align 4
+  ret i32 %b
+}
+
+define i64 @array_index_sh1_sh3(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh1_sh3:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 4
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh1_sh3:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a1, 4
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 3
+; RV64XTHEADBA-NEXT:    ld a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [2 x i64], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i64, ptr %a, align 8
+  ret i64 %b
+}
+
+define i8 @array_index_sh2_sh0(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh2_sh0:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 2
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh2_sh0:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV64XTHEADBA-NEXT:    add a0, a0, a2
+; RV64XTHEADBA-NEXT:    lbu a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [4 x i8], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i8, ptr %a, align 1
+  ret i8 %b
+}
+
+define i16 @array_index_sh2_sh1(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh2_sh1:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 1
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    lh a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh2_sh1:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 1
+; RV64XTHEADBA-NEXT:    lh a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [4 x i16], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i16, ptr %a, align 2
+  ret i16 %b
+}
+
+define i32 @array_index_sh2_sh2(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh2_sh2:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 4
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 2
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    lw a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh2_sh2:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a1, 4
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 2
+; RV64XTHEADBA-NEXT:    lw a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [4 x i32], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i32, ptr %a, align 4
+  ret i32 %b
+}
+
+define i64 @array_index_sh2_sh3(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh2_sh3:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 5
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh2_sh3:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a1, 5
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 3
+; RV64XTHEADBA-NEXT:    ld a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [4 x i64], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i64, ptr %a, align 8
+  ret i64 %b
+}
+
+define i8 @array_index_sh3_sh0(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh3_sh0:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh3_sh0:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV64XTHEADBA-NEXT:    add a0, a0, a2
+; RV64XTHEADBA-NEXT:    lbu a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [8 x i8], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i8, ptr %a, align 1
+  ret i8 %b
+}
+
+define i16 @array_index_sh3_sh1(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh3_sh1:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 4
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 1
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    lh a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh3_sh1:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a1, 4
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 1
+; RV64XTHEADBA-NEXT:    lh a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [8 x i16], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i16, ptr %a, align 2
+  ret i16 %b
+}
+
+define i32 @array_index_sh3_sh2(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh3_sh2:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 5
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 2
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    lw a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh3_sh2:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a1, 5
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 2
+; RV64XTHEADBA-NEXT:    lw a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [8 x i32], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i32, ptr %a, align 4
+  ret i32 %b
+}
+
+define i64 @array_index_sh3_sh3(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh3_sh3:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 6
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh3_sh3:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a1, 6
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 3
+; RV64XTHEADBA-NEXT:    ld a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [8 x i64], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i64, ptr %a, align 8
+  ret i64 %b
+}
+
+; Similar to above, but with a lshr on one of the indices. This requires
+; special handling during isel to form a shift pair.
+define i64 @array_index_lshr_sh3_sh3(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_lshr_sh3_sh3:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srli a1, a1, 58
+; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    slli a1, a1, 6
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_lshr_sh3_sh3:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srli a1, a1, 58
+; RV64XTHEADBA-NEXT:    slli a1, a1, 6
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 3
+; RV64XTHEADBA-NEXT:    ld a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %shr = lshr i64 %idx1, 58
+  %a = getelementptr inbounds [8 x i64], ptr %p, i64 %shr, i64 %idx2
+  %b = load i64, ptr %a, align 8
+  ret i64 %b
+}
+
+define i8 @array_index_sh4_sh0(ptr %p, i64 %idx1, i64 %idx2) {
+; CHECK-LABEL: array_index_sh4_sh0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    lbu a0, 0(a0)
+; CHECK-NEXT:    ret
+  %a = getelementptr inbounds [16 x i8], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i8, ptr %a, align 1
+  ret i8 %b
+}
+
+define i16 @array_index_sh4_sh1(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh4_sh1:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 5
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 1
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    lh a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh4_sh1:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a1, 5
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 1
+; RV64XTHEADBA-NEXT:    lh a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [16 x i16], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i16, ptr %a, align 2
+  ret i16 %b
+}
+
+define i32 @array_index_sh4_sh2(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh4_sh2:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 6
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 2
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    lw a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh4_sh2:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a1, 6
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 2
+; RV64XTHEADBA-NEXT:    lw a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [16 x i32], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i32, ptr %a, align 4
+  ret i32 %b
+}
+
+define i64 @array_index_sh4_sh3(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh4_sh3:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 7
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh4_sh3:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a1, 7
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 3
+; RV64XTHEADBA-NEXT:    ld a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [16 x i64], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i64, ptr %a, align 8
+  ret i64 %b
+}
+
+define i64 @mul_neg1(i64 %a) {
+; CHECK-LABEL: mul_neg1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    ret
+  %c = mul i64 %a, -1
+  ret i64 %c
+}
+
+define i64 @mul_neg2(i64 %a) {
+; CHECK-LABEL: mul_neg2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    ret
+  %c = mul i64 %a, -2
+  ret i64 %c
+}
+
+define i64 @mul_neg3(i64 %a) {
+; RV64I-LABEL: mul_neg3:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a0, 1
+; RV64I-NEXT:    neg a0, a0
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul_neg3:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 1
+; RV64XTHEADBA-NEXT:    neg a0, a0
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, -3
+  ret i64 %c
+}
+
+define i64 @mul_neg4(i64 %a) {
+; CHECK-LABEL: mul_neg4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    ret
+  %c = mul i64 %a, -4
+  ret i64 %c
+}
+
+define i64 @mul_neg5(i64 %a) {
+; RV64I-LABEL: mul_neg5:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a0, 2
+; RV64I-NEXT:    neg a0, a0
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul_neg5:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    neg a0, a0
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, -5
   ret i64 %c
 }
+
+define i64 @mul_neg6(i64 %a) {
+; CHECK-LABEL: mul_neg6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -6
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = mul i64 %a, -6
+  ret i64 %c
+}
+
+define i64 @mul_neg7(i64 %a) {
+; CHECK-LABEL: mul_neg7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    sub a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = mul i64 %a, -7
+  ret i64 %c
+}
+
+define i64 @mul_neg8(i64 %a) {
+; CHECK-LABEL: mul_neg8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    ret
+  %c = mul i64 %a, -8
+  ret i64 %c
+}
+
+define ptr @srai_srli_sh3add(ptr %0, i64 %1) nounwind {
+; RV64I-LABEL: srai_srli_sh3add:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    srai a1, a1, 32
+; RV64I-NEXT:    srli a1, a1, 6
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srai_srli_sh3add:
+; RV64XTHEADBA:       # %bb.0: # %entry
+; RV64XTHEADBA-NEXT:    srai a1, a1, 32
+; RV64XTHEADBA-NEXT:    srli a1, a1, 6
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV64XTHEADBA-NEXT:    ret
+entry:
+  %2 = ashr i64 %1, 32
+  %3 = lshr i64 %2, 6
+  %4 = getelementptr i64, ptr %0, i64 %3
+  ret ptr %4
+}
+
+define ptr @srai_srli_slli(ptr %0, i64 %1) nounwind {
+; CHECK-LABEL: srai_srli_slli:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    srai a1, a1, 32
+; CHECK-NEXT:    srli a1, a1, 6
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+entry:
+  %2 = ashr i64 %1, 32
+  %3 = lshr i64 %2, 6
+  %4 = getelementptr i128, ptr %0, i64 %3
+  ret ptr %4
+}
+
+; Negative to make sure the peephole added for srai_srli_slli and
+; srai_srli_sh3add doesn't break this.
+define i64 @srai_andi(i64 %x) nounwind {
+; CHECK-LABEL: srai_andi:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    srai a0, a0, 8
+; CHECK-NEXT:    andi a0, a0, -8
+; CHECK-NEXT:    ret
+entry:
+  %y = ashr i64 %x, 8
+  %z = and i64 %y, -8
+  ret i64 %z
+}
+
+; Negative to make sure the peephole added for srai_srli_slli and
+; srai_srli_sh3add doesn't break this.
+define i64 @srai_lui_and(i64 %x) nounwind {
+; CHECK-LABEL: srai_lui_and:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    srai a0, a0, 8
+; CHECK-NEXT:    lui a1, 1048574
+; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    ret
+entry:
+  %y = ashr i64 %x, 8
+  %z = and i64 %y, -8192
+  ret i64 %z
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
index 10d2492..4d34621 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
@@ -1445,10 +1445,9 @@ define <vscale x 1 x i64> @vp_bitreverse_nxv1i64(<vscale x 1 x i64> %va, <vscale
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a6), zero
 ; RV32-NEXT:    lui a4, 61681
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v10, v8, a3, v0.t
 ; RV32-NEXT:    addi a5, a5, -256
 ; RV32-NEXT:    vand.vx v11, v8, a5, v0.t
@@ -1595,9 +1594,7 @@ define <vscale x 1 x i64> @vp_bitreverse_nxv1i64_unmasked(<vscale x 1 x i64> %va
 ; RV32-NEXT:    vand.vx v13, v8, a1
 ; RV32-NEXT:    vand.vx v12, v12, a1
 ; RV32-NEXT:    vor.vv v11, v12, v11
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v13, v13, a4
 ; RV32-NEXT:    vor.vv v10, v10, v13
 ; RV32-NEXT:    vsrl.vi v13, v8, 8
@@ -1730,10 +1727,9 @@ define <vscale x 2 x i64> @vp_bitreverse_nxv2i64(<vscale x 2 x i64> %va, <vscale
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsetvli a4, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a6), zero
 ; RV32-NEXT:    lui a4, 61681
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsll.vx v12, v8, a3, v0.t
 ; RV32-NEXT:    addi a5, a5, -256
 ; RV32-NEXT:    vand.vx v14, v8, a5, v0.t
@@ -1880,9 +1876,7 @@ define <vscale x 2 x i64> @vp_bitreverse_nxv2i64_unmasked(<vscale x 2 x i64> %va
 ; RV32-NEXT:    vand.vx v18, v8, a1
 ; RV32-NEXT:    vand.vx v16, v16, a1
 ; RV32-NEXT:    vor.vv v10, v16, v10
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsll.vx v18, v18, a4
 ; RV32-NEXT:    vor.vv v12, v12, v18
 ; RV32-NEXT:    vsrl.vi v18, v8, 8
@@ -2015,10 +2009,9 @@ define <vscale x 4 x i64> @vp_bitreverse_nxv4i64(<vscale x 4 x i64> %va, <vscale
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsetvli a4, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a6), zero
 ; RV32-NEXT:    lui a4, 61681
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a3, v0.t
 ; RV32-NEXT:    addi a5, a5, -256
 ; RV32-NEXT:    vand.vx v20, v8, a5, v0.t
@@ -2165,9 +2158,7 @@ define <vscale x 4 x i64> @vp_bitreverse_nxv4i64_unmasked(<vscale x 4 x i64> %va
 ; RV32-NEXT:    vand.vx v28, v8, a1
 ; RV32-NEXT:    vand.vx v24, v24, a1
 ; RV32-NEXT:    vor.vv v12, v24, v12
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsll.vx v28, v28, a4
 ; RV32-NEXT:    vor.vv v16, v16, v28
 ; RV32-NEXT:    vsrl.vi v28, v8, 8
@@ -2315,7 +2306,6 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a5), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
@@ -2323,7 +2313,6 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    addi a5, sp, 16
@@ -2528,9 +2517,7 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64_unmasked(<vscale x 7 x i64> %va
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 24
 ; RV32-NEXT:    vand.vx v16, v16, a5
 ; RV32-NEXT:    vsrl.vi v0, v8, 8
@@ -2704,7 +2691,6 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a5), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
@@ -2712,7 +2698,6 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    addi a5, sp, 16
@@ -2917,9 +2902,7 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64_unmasked(<vscale x 8 x i64> %va
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 24
 ; RV32-NEXT:    vand.vx v16, v16, a5
 ; RV32-NEXT:    vsrl.vi v0, v8, 8
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
index 0dc1d0c..0c58cca 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
@@ -523,11 +523,9 @@ define <vscale x 1 x i64> @vp_bswap_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v9, v8, a2, v0.t
-; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vand.vx v10, v8, a0, v0.t
 ; RV32-NEXT:    vlse64.v v11, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v10, v10, a4, v0.t
 ; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vand.vx v10, v8, a5, v0.t
@@ -538,7 +536,7 @@ define <vscale x 1 x i64> @vp_bswap_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1
 ; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vsrl.vx v10, v8, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v12, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a1, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a0, v0.t
 ; RV32-NEXT:    vor.vv v10, v12, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
 ; RV32-NEXT:    vand.vx v12, v12, a5, v0.t
@@ -609,15 +607,13 @@ define <vscale x 1 x i64> @vp_bswap_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsll.vx v10, v8, a2
-; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    addi a0, a3, -256
 ; RV32-NEXT:    vsrl.vx v11, v8, a2
 ; RV32-NEXT:    vsrl.vx v12, v8, a4
-; RV32-NEXT:    vand.vx v13, v8, a1
-; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vand.vx v13, v8, a0
+; RV32-NEXT:    vand.vx v12, v12, a0
 ; RV32-NEXT:    vor.vv v11, v12, v11
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v13, v13, a4
 ; RV32-NEXT:    vor.vv v10, v10, v13
 ; RV32-NEXT:    vsrl.vi v13, v8, 8
@@ -695,11 +691,9 @@ define <vscale x 2 x i64> @vp_bswap_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsll.vx v10, v8, a2, v0.t
-; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v12, v8, a1, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e64, m2, ta, ma
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vand.vx v12, v8, a0, v0.t
 ; RV32-NEXT:    vlse64.v v14, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsll.vx v12, v12, a4, v0.t
 ; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
 ; RV32-NEXT:    vand.vx v12, v8, a5, v0.t
@@ -710,7 +704,7 @@ define <vscale x 2 x i64> @vp_bswap_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
 ; RV32-NEXT:    vsrl.vx v12, v8, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV32-NEXT:    vor.vv v12, v16, v12, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; RV32-NEXT:    vand.vx v16, v16, a5, v0.t
@@ -781,15 +775,13 @@ define <vscale x 2 x i64> @vp_bswap_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsll.vx v12, v8, a2
-; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    addi a0, a3, -256
 ; RV32-NEXT:    vsrl.vx v14, v8, a2
 ; RV32-NEXT:    vsrl.vx v16, v8, a4
-; RV32-NEXT:    vand.vx v18, v8, a1
-; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vand.vx v18, v8, a0
+; RV32-NEXT:    vand.vx v16, v16, a0
 ; RV32-NEXT:    vor.vv v14, v16, v14
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsll.vx v18, v18, a4
 ; RV32-NEXT:    vor.vv v12, v12, v18
 ; RV32-NEXT:    vsrl.vi v18, v8, 8
@@ -867,11 +859,9 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
-; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v20, v8, a1, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e64, m4, ta, ma
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vand.vx v20, v8, a0, v0.t
 ; RV32-NEXT:    vlse64.v v12, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsll.vx v20, v20, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
 ; RV32-NEXT:    vand.vx v20, v8, a5, v0.t
@@ -882,7 +872,7 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
 ; RV32-NEXT:    vsrl.vx v20, v8, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a0, v0.t
 ; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
 ; RV32-NEXT:    vand.vx v24, v24, a5, v0.t
@@ -953,15 +943,13 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsll.vx v16, v8, a2
-; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    addi a0, a3, -256
 ; RV32-NEXT:    vsrl.vx v20, v8, a2
 ; RV32-NEXT:    vsrl.vx v24, v8, a4
-; RV32-NEXT:    vand.vx v28, v8, a1
-; RV32-NEXT:    vand.vx v24, v24, a1
+; RV32-NEXT:    vand.vx v28, v8, a0
+; RV32-NEXT:    vand.vx v24, v24, a0
 ; RV32-NEXT:    vor.vv v20, v24, v20
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsll.vx v28, v28, a4
 ; RV32-NEXT:    vor.vv v16, v16, v28
 ; RV32-NEXT:    vsrl.vi v28, v8, 8
@@ -1043,51 +1031,49 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
-; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vand.vx v24, v8, a0, v0.t
 ; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a0, v0.t
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
@@ -1193,24 +1179,22 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v24, v8, a2
-; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    addi a0, a3, -256
 ; RV32-NEXT:    vsrl.vx v16, v8, a2
 ; RV32-NEXT:    vsrl.vx v0, v8, a4
-; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vand.vx v0, v0, a0
 ; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a0
 ; RV32-NEXT:    vsll.vx v0, v0, a4
 ; RV32-NEXT:    vor.vv v16, v24, v0
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vlse64.v v0, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 24
 ; RV32-NEXT:    vand.vx v16, v16, a5
 ; RV32-NEXT:    vsrl.vi v24, v8, 8
@@ -1221,7 +1205,6 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v24, v24, 8
 ; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    csrr a0, vlenb
@@ -1318,51 +1301,49 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
-; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vand.vx v24, v8, a0, v0.t
 ; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a0, v0.t
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
@@ -1468,24 +1449,22 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v24, v8, a2
-; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    addi a0, a3, -256
 ; RV32-NEXT:    vsrl.vx v16, v8, a2
 ; RV32-NEXT:    vsrl.vx v0, v8, a4
-; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vand.vx v0, v0, a0
 ; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a0
 ; RV32-NEXT:    vsll.vx v0, v0, a4
 ; RV32-NEXT:    vor.vv v16, v24, v0
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vlse64.v v0, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 24
 ; RV32-NEXT:    vand.vx v16, v16, a5
 ; RV32-NEXT:    vsrl.vi v24, v8, 8
@@ -1496,7 +1475,6 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v24, v24, 8
 ; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    csrr a0, vlenb
@@ -1716,11 +1694,9 @@ define <vscale x 1 x i48> @vp_bswap_nxv1i48(<vscale x 1 x i48> %va, <vscale x 1
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v9, v8, a2, v0.t
-; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vand.vx v10, v8, a0, v0.t
 ; RV32-NEXT:    vlse64.v v11, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v10, v10, a4, v0.t
 ; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vand.vx v10, v8, a5, v0.t
@@ -1731,7 +1707,7 @@ define <vscale x 1 x i48> @vp_bswap_nxv1i48(<vscale x 1 x i48> %va, <vscale x 1
 ; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vsrl.vx v10, v8, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v12, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a1, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a0, v0.t
 ; RV32-NEXT:    vor.vv v10, v12, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
 ; RV32-NEXT:    vand.vx v12, v12, a5, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-neg-abs.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-neg-abs.ll
new file mode 100644
index 0000000..6f1efb6
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-neg-abs.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+
+define <2 x i64> @expanded_fixed_neg_abs64(<2 x i64> %x) {
+; CHECK-LABEL: expanded_fixed_neg_abs64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vmin.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %t = sub <2 x i64> <i64 0, i64 0>, %x
+  %t1 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %t, <2 x i64> %x)
+  %t2 = sub <2 x i64> <i64 0, i64 0>, %t1
+  ret <2 x i64> %t2
+}
+
+define <2 x i64> @expanded_fixed_neg_abs64_unsigned(<2 x i64> %x) {
+; CHECK-LABEL: expanded_fixed_neg_abs64_unsigned:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vminu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %t = sub <2 x i64> <i64 0, i64 0>, %x
+  %t1 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %t, <2 x i64> %x)
+  %t2 = sub <2 x i64> <i64 0, i64 0>, %t1
+  ret <2 x i64> %t2
+}
+
+define <2 x i64> @expanded_fixed_neg_inv_abs64(<2 x i64> %x) {
+; CHECK-LABEL: expanded_fixed_neg_inv_abs64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vmax.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %t = sub <2 x i64> <i64 0, i64 0>, %x
+  %t1 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %t, <2 x i64> %x)
+  %t2 = sub <2 x i64> <i64 0, i64 0>, %t1
+  ret <2 x i64> %t2
+}
+
+define <2 x i64> @expanded_fixed_neg_inv_abs64_unsigned(<2 x i64> %x) {
+; CHECK-LABEL: expanded_fixed_neg_inv_abs64_unsigned:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vmaxu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %t = sub <2 x i64> <i64 0, i64 0>, %x
+  %t1 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %t, <2 x i64> %x)
+  %t2 = sub <2 x i64> <i64 0, i64 0>, %t1
+  ret <2 x i64> %t2
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
index 15793ea..66952ca 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
@@ -1254,12 +1254,10 @@ define void @copysign_neg_trunc_v4f16_v4f32(ptr %x, ptr %y) {
 define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) {
 ; ZVFH-LABEL: copysign_neg_trunc_v3f16_v3f32:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetivli zero, 3, e32, m1, ta, ma
+; ZVFH-NEXT:    vsetivli zero, 3, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vle32.v v8, (a1)
 ; ZVFH-NEXT:    vle16.v v9, (a0)
-; ZVFH-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFH-NEXT:    vsetivli zero, 3, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfsgnjn.vv v8, v9, v10
 ; ZVFH-NEXT:    vse16.v v8, (a0)
 ; ZVFH-NEXT:    ret
@@ -1272,9 +1270,7 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) {
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    addi a2, a1, -1
 ; ZVFHMIN-NEXT:    vand.vx v8, v8, a2
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v9
-; ZVFHMIN-NEXT:    vsetivli zero, 3, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v9, v10, a1
 ; ZVFHMIN-NEXT:    vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
@@ -4013,9 +4009,10 @@ define void @trunc_v6f16(ptr %x) {
 ; ZVFH-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
-; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, mu
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
 ; ZVFH-NEXT:    vse16.v v8, (a0)
 ; ZVFH-NEXT:    ret
@@ -4197,10 +4194,11 @@ define void @ceil_v6f16(ptr %x) {
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    fsrmi a1, 3
+; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a1
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
-; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, mu
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
 ; ZVFH-NEXT:    vse16.v v8, (a0)
 ; ZVFH-NEXT:    ret
@@ -4388,10 +4386,11 @@ define void @floor_v6f16(ptr %x) {
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    fsrmi a1, 2
+; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a1
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
-; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, mu
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
 ; ZVFH-NEXT:    vse16.v v8, (a0)
 ; ZVFH-NEXT:    ret
@@ -4579,10 +4578,11 @@ define void @round_v6f16(ptr %x) {
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    fsrmi a1, 4
+; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a1
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
-; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, mu
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
 ; ZVFH-NEXT:    vse16.v v8, (a0)
 ; ZVFH-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
index 59c7feb..80e462c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -1142,9 +1142,7 @@ define void @mulhu_v6i16(ptr %x) {
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    lui a1, %hi(.LCPI67_0)
 ; CHECK-NEXT:    addi a1, a1, %lo(.LCPI67_0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v9, (a1)
-; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
index 4f0f5dd..bf8baaf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
@@ -530,7 +530,7 @@ define i32 @reduce_and_16xi32_prefix5(ptr %p) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsetivli zero, 5, e32, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, -1
 ; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
 ; CHECK-NEXT:    vredand.vs v8, v8, v10
@@ -725,7 +725,7 @@ define i32 @reduce_umin_16xi32_prefix5(ptr %p) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    vsetivli zero, 5, e32, m1, ta, ma
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.i v10, -1
 ; RV32-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
 ; RV32-NEXT:    vredminu.vs v8, v8, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
index 4e06d00..bb05eb5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
@@ -168,11 +168,12 @@ define <4 x i64> @m2_splat_into_slide_two_source_v2_lo(<4 x i64> %v1, <4 x i64>
 define <4 x i64> @m2_splat_into_slide_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) {
 ; CHECK-LABEL: m2_splat_into_slide_two_source:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v13, v10, 1
-; CHECK-NEXT:    vslideup.vi v13, v11, 1
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 12
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vi v12, v8, 0
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vslideup.vi v12, v10, 1, v0.t
+; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i64> %v1, <4 x i64> %v2, <4 x i32> <i32 0, i32 0, i32 5, i32 6>
   ret <4 x i64> %res
@@ -182,17 +183,18 @@ define void @shuffle1(ptr %explicit_0, ptr %explicit_1) vscale_range(2,2) {
 ; CHECK-LABEL: shuffle1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi a0, a0, 252
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vid.v v8
 ; CHECK-NEXT:    vsetivli zero, 3, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v11, (a0)
-; CHECK-NEXT:    vmv.v.i v0, 5
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT:    vsrl.vi v10, v10, 1
-; CHECK-NEXT:    vadd.vi v10, v10, 1
-; CHECK-NEXT:    vrgather.vv v9, v11, v10, v0.t
+; CHECK-NEXT:    vle32.v v9, (a0)
+; CHECK-NEXT:    li a0, 175
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsrl.vi v8, v8, 1
+; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vadd.vi v8, v8, 1
+; CHECK-NEXT:    vrgather.vv v11, v9, v8
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    addi a0, a1, 672
 ; CHECK-NEXT:    vs2r.v v8, (a0)
 ; CHECK-NEXT:    ret
@@ -209,15 +211,15 @@ define void @shuffle1(ptr %explicit_0, ptr %explicit_1) vscale_range(2,2) {
 define <16 x float> @shuffle2(<4 x float> %a) vscale_range(2,2) {
 ; CHECK-LABEL: shuffle2:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    li a0, -97
+; CHECK-NEXT:    vadd.vv v9, v9, v9
+; CHECK-NEXT:    vrsub.vi v9, v9, 4
+; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vrgather.vv v13, v8, v9
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v12, v8
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT:    vid.v v13
-; CHECK-NEXT:    vadd.vv v13, v13, v13
-; CHECK-NEXT:    vmv.v.i v0, 6
-; CHECK-NEXT:    vrsub.vi v13, v13, 4
-; CHECK-NEXT:    vrgather.vv v9, v12, v13, v0.t
+; CHECK-NEXT:    vmerge.vim v8, v12, 0, v0
 ; CHECK-NEXT:    ret
   %b = extractelement <4 x float> %a, i32 2
   %c = insertelement <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %b, i32 5
@@ -229,15 +231,16 @@ define <16 x float> @shuffle2(<4 x float> %a) vscale_range(2,2) {
 define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vscale_range(2,2) {
 ; RV32-LABEL: extract_any_extend_vector_inreg_v16i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
 ; RV32-NEXT:    vmv.v.i v16, 0
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; RV32-NEXT:    vmv.v.i v0, 1
+; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vrgather.vi v18, v15, 1, v0.t
-; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
+; RV32-NEXT:    vrgather.vi v16, v8, 15, v0.t
+; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vx v8, v16, a0
 ; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -255,14 +258,13 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vsca
 ; RV64-NEXT:    addi s0, sp, 256
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -128
-; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; RV64-NEXT:    vmv.v.i v0, 1
+; RV64-NEXT:    li a1, -17
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vmv.v.i v16, 0
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; RV64-NEXT:    vrgather.vi v18, v15, 1, v0.t
+; RV64-NEXT:    vmv.s.x v0, a1
+; RV64-NEXT:    vrgather.vi v16, v8, 15
+; RV64-NEXT:    vmerge.vim v8, v16, 0, v0
 ; RV64-NEXT:    mv s2, sp
-; RV64-NEXT:    vs8r.v v16, (s2)
+; RV64-NEXT:    vs8r.v v8, (s2)
 ; RV64-NEXT:    andi a0, a0, 15
 ; RV64-NEXT:    li a1, 8
 ; RV64-NEXT:    call __muldi3
@@ -288,16 +290,21 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vsca
 define <4 x double> @shuffles_add(<4 x double> %0, <4 x double> %1) vscale_range(2,2) {
 ; CHECK-LABEL: shuffles_add:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; CHECK-NEXT:    vmv1r.v v13, v10
-; CHECK-NEXT:    vslideup.vi v13, v11, 1
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    vmv.v.i v0, 1
-; CHECK-NEXT:    vrgather.vi v12, v9, 0
-; CHECK-NEXT:    vmv1r.v v9, v11
-; CHECK-NEXT:    vrgather.vi v9, v10, 1, v0.t
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vfadd.vv v8, v12, v8
+; CHECK-NEXT:    vrgather.vi v12, v8, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vid.v v14
+; CHECK-NEXT:    vmv.v.i v0, 12
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vrgather.vi v16, v8, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v14, v14
+; CHECK-NEXT:    vadd.vi v9, v8, -4
+; CHECK-NEXT:    vadd.vi v8, v8, -3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
+; CHECK-NEXT:    vrgatherei16.vv v12, v10, v9, v0.t
+; CHECK-NEXT:    vrgatherei16.vv v16, v10, v8, v0.t
+; CHECK-NEXT:    vfadd.vv v8, v12, v16
 ; CHECK-NEXT:    ret
   %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
   %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
@@ -305,3 +312,91 @@ define <4 x double> @shuffles_add(<4 x double> %0, <4 x double> %1) vscale_range
   ret <4 x double> %5
 }
 
+define <16 x i32> @m4_square_num_of_shuffles_in_chunks(<16 x i32> %0) vscale_range(2,2) {
+; CHECK-LABEL: m4_square_num_of_shuffles_in_chunks:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lui a0, %hi(.LCPI17_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI17_0)
+; CHECK-NEXT:    vl1r.v v12, (a0)
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsext.vf2 v16, v12
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %1 = shufflevector <16 x i32> %0, <16 x i32> poison, <16 x i32> <i32 0, i32 5, i32 8, i32 12, i32 1, i32 4, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  ret <16 x i32> %1
+}
+
+define <16 x i32> @m4_linear_num_of_shuffles_in_chunks(<16 x i32> %0) vscale_range(2,2) {
+; CHECK-LABEL: m4_linear_num_of_shuffles_in_chunks:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lui a0, %hi(.LCPI18_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI18_0)
+; CHECK-NEXT:    vl2re16.v v16, (a0)
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %1 = shufflevector <16 x i32> %0, <16 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 8, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 11, i32 poison>
+  ret <16 x i32> %1
+}
+
+define i64 @multi_chunks_shuffle(<32 x i32> %0) vscale_range(8,8) {
+; RV32-LABEL: multi_chunks_shuffle:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.i v10, 0
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    li a1, 63
+; RV32-NEXT:    vwsubu.vx v12, v10, a0
+; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vand.vx v10, v10, a1
+; RV32-NEXT:    vsrl.vv v12, v8, v12
+; RV32-NEXT:    vsll.vv v8, v8, v10
+; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.i v10, 0
+; RV32-NEXT:    vmerge.vvm v8, v10, v8, v0
+; RV32-NEXT:    vrgather.vi v10, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v8, 1
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    srai a1, a0, 31
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: multi_chunks_shuffle:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    li a0, 32
+; RV64-NEXT:    vsetivli zero, 16, e64, m2, ta, ma
+; RV64-NEXT:    vsrl.vx v10, v8, a0
+; RV64-NEXT:    vsll.vx v8, v8, a0
+; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    addi a0, a0, -241
+; RV64-NEXT:    vor.vv v8, v8, v10
+; RV64-NEXT:    vmv.s.x v0, a0
+; RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV64-NEXT:    vmv.v.i v10, 0
+; RV64-NEXT:    vmerge.vvm v8, v10, v8, v0
+; RV64-NEXT:    vrgather.vi v10, v8, 2
+; RV64-NEXT:    vor.vv v8, v8, v10
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v8, v8, 1
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    ret
+entry:
+  %1 = shufflevector <32 x i32> %0, <32 x i32> zeroinitializer, <32 x i32> <i32 1, i32 0, i32 3, i32 2, i32 37, i32 36, i32 39, i32 38, i32 9, i32 8, i32 11, i32 10, i32 45, i32 44, i32 47, i32 46, i32 17, i32 16, i32 19, i32 18, i32 53, i32 52, i32 55, i32 54, i32 25, i32 24, i32 27, i32 26, i32 61, i32 60, i32 63, i32 62>
+  %2 = shufflevector <32 x i32> zeroinitializer, <32 x i32> %1, <32 x i32> <i32 3, i32 34, i32 33, i32 0, i32 7, i32 38, i32 37, i32 4, i32 11, i32 42, i32 41, i32 8, i32 15, i32 46, i32 45, i32 12, i32 19, i32 50, i32 49, i32 16, i32 23, i32 54, i32 53, i32 20, i32 27, i32 58, i32 57, i32 24, i32 31, i32 62, i32 61, i32 28>
+  %3 = or <32 x i32> %1, %2
+  %4 = extractelement <32 x i32> %3, i64 1
+  %conv199 = sext i32 %4 to i64
+  ret i64 %conv199
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
index 5d407ca..05254e6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
@@ -473,6 +473,7 @@ define <32 x i64> @select_evl_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c)
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
index 7649d60..33fe73a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
@@ -582,14 +582,14 @@ define <vscale x 1 x half> @vfmax_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v8, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8, v0.t
@@ -616,13 +616,13 @@ define <vscale x 1 x half> @vfmax_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
@@ -652,14 +652,14 @@ define <vscale x 2 x half> @vfmax_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v8, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8, v0.t
@@ -686,13 +686,13 @@ define <vscale x 2 x half> @vfmax_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
@@ -722,15 +722,15 @@ define <vscale x 4 x half> @vfmax_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v12, v14, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v14, v14, v0.t
@@ -758,13 +758,13 @@ define <vscale x 4 x half> @vfmax_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v10, v12, v10, v0
@@ -796,15 +796,15 @@ define <vscale x 8 x half> @vfmax_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v12, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v16, v20, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v20, v20, v0.t
@@ -832,13 +832,13 @@ define <vscale x 8 x half> @vfmax_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v12, v16, v12, v0
@@ -876,15 +876,15 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
@@ -927,10 +927,10 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v24, v0
@@ -995,64 +995,62 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a1, a1, a3
+; ZVFHMIN-NEXT:    li a2, 25
+; ZVFHMIN-NEXT:    mul a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 5
+; ZVFHMIN-NEXT:    li a5, 24
+; ZVFHMIN-NEXT:    mul a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v12, v0, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs1r.v v0, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    li a4, 24
-; ZVFHMIN-NEXT:    mul a3, a3, a4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    li a3, 25
+; ZVFHMIN-NEXT:    mul a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a2, a2, a3
+; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl1r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
@@ -1070,35 +1068,43 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v24, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v24
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v24, v0.t
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    li a1, 25
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 25
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
@@ -1110,7 +1116,7 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    li a1, 25
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
@@ -1152,68 +1158,61 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    li a2, 25
+; ZVFHMIN-NEXT:    mul a1, a1, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x19, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 25 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a1, a1, a3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv8r.v v0, v8
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v12, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v8, v24, a2
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    li a4, 24
-; ZVFHMIN-NEXT:    mul a3, a3, a4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a3, a2, 4
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a2, a2, a3
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl1r.v v13, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -1221,7 +1220,8 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
@@ -1229,43 +1229,49 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    li a1, 25
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
index 8e448fc..c65712e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
@@ -582,14 +582,14 @@ define <vscale x 1 x half> @vfmin_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v8, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8, v0.t
@@ -616,13 +616,13 @@ define <vscale x 1 x half> @vfmin_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
@@ -652,14 +652,14 @@ define <vscale x 2 x half> @vfmin_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v8, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8, v0.t
@@ -686,13 +686,13 @@ define <vscale x 2 x half> @vfmin_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
@@ -722,15 +722,15 @@ define <vscale x 4 x half> @vfmin_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v12, v14, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v14, v14, v0.t
@@ -758,13 +758,13 @@ define <vscale x 4 x half> @vfmin_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v10, v12, v10, v0
@@ -796,15 +796,15 @@ define <vscale x 8 x half> @vfmin_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v12, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v16, v20, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v20, v20, v0.t
@@ -832,13 +832,13 @@ define <vscale x 8 x half> @vfmin_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v12, v16, v12, v0
@@ -876,15 +876,15 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
@@ -927,10 +927,10 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v24, v0
@@ -995,64 +995,62 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a1, a1, a3
+; ZVFHMIN-NEXT:    li a2, 25
+; ZVFHMIN-NEXT:    mul a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 5
+; ZVFHMIN-NEXT:    li a5, 24
+; ZVFHMIN-NEXT:    mul a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v12, v0, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs1r.v v0, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    li a4, 24
-; ZVFHMIN-NEXT:    mul a3, a3, a4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    li a3, 25
+; ZVFHMIN-NEXT:    mul a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a2, a2, a3
+; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl1r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
@@ -1070,35 +1068,43 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v24, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v24
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v24, v0.t
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    li a1, 25
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 25
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
@@ -1110,7 +1116,7 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    li a1, 25
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
@@ -1152,68 +1158,61 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    li a2, 25
+; ZVFHMIN-NEXT:    mul a1, a1, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x19, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 25 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a1, a1, a3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv8r.v v0, v8
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v12, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v8, v24, a2
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    li a4, 24
-; ZVFHMIN-NEXT:    mul a3, a3, a4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a3, a2, 4
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a2, a2, a3
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl1r.v v13, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -1221,7 +1220,8 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
@@ -1229,43 +1229,49 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    li a1, 25
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll b/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll
index 2fda344..6787c8c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll
@@ -18,7 +18,7 @@ entry:
 define i64 @reduce_add2(<4 x i64> %v) {
 ; CHECK-LABEL: reduce_add2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 8
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vredsum.vs v8, v8, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
index 70b5384..06f4876 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
@@ -1658,10 +1658,10 @@ define <vscale x 1 x i1> @fcmp_oeq_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_oeq_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, metadata !"oeq", <vscale x 1 x i1> %m, i32 %evl)
@@ -1678,11 +1678,11 @@ define <vscale x 1 x i1> @fcmp_oeq_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_oeq_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1701,11 +1701,11 @@ define <vscale x 1 x i1> @fcmp_oeq_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_oeq_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1723,10 +1723,10 @@ define <vscale x 1 x i1> @fcmp_ogt_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ogt_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, metadata !"ogt", <vscale x 1 x i1> %m, i32 %evl)
@@ -1743,11 +1743,11 @@ define <vscale x 1 x i1> @fcmp_ogt_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ogt_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1766,11 +1766,11 @@ define <vscale x 1 x i1> @fcmp_ogt_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ogt_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v0, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1788,10 +1788,10 @@ define <vscale x 1 x i1> @fcmp_oge_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_oge_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, metadata !"oge", <vscale x 1 x i1> %m, i32 %evl)
@@ -1808,11 +1808,11 @@ define <vscale x 1 x i1> @fcmp_oge_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_oge_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1831,11 +1831,11 @@ define <vscale x 1 x i1> @fcmp_oge_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_oge_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v0, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1853,10 +1853,10 @@ define <vscale x 1 x i1> @fcmp_olt_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_olt_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v0, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, metadata !"olt", <vscale x 1 x i1> %m, i32 %evl)
@@ -1873,11 +1873,11 @@ define <vscale x 1 x i1> @fcmp_olt_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_olt_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v0, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1896,11 +1896,11 @@ define <vscale x 1 x i1> @fcmp_olt_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_olt_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1918,10 +1918,10 @@ define <vscale x 1 x i1> @fcmp_ole_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ole_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v0, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, metadata !"ole", <vscale x 1 x i1> %m, i32 %evl)
@@ -1938,11 +1938,11 @@ define <vscale x 1 x i1> @fcmp_ole_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ole_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v0, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1961,11 +1961,11 @@ define <vscale x 1 x i1> @fcmp_ole_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ole_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1985,10 +1985,10 @@ define <vscale x 1 x i1> @fcmp_one_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_one_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v9, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -2009,11 +2009,11 @@ define <vscale x 1 x i1> @fcmp_one_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_one_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v9, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -2036,11 +2036,11 @@ define <vscale x 1 x i1> @fcmp_one_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_one_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v9, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -2062,10 +2062,10 @@ define <vscale x 1 x i1> @fcmp_ord_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ord_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vmfeq.vv v9, v9, v9, v0.t
 ; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
@@ -2088,14 +2088,14 @@ define <vscale x 1 x i1> @fcmp_ord_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ord_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v9, v9, v9, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2119,14 +2119,14 @@ define <vscale x 1 x i1> @fcmp_ord_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ord_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v9, v9, v9, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vmand.mm v0, v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -2147,10 +2147,10 @@ define <vscale x 1 x i1> @fcmp_ueq_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ueq_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v9, v0.t
 ; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
@@ -2171,11 +2171,11 @@ define <vscale x 1 x i1> @fcmp_ueq_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ueq_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v9, v0.t
 ; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
@@ -2198,11 +2198,11 @@ define <vscale x 1 x i1> @fcmp_ueq_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ueq_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v9, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
@@ -2223,10 +2223,10 @@ define <vscale x 1 x i1> @fcmp_ugt_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ugt_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2245,11 +2245,11 @@ define <vscale x 1 x i1> @fcmp_ugt_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ugt_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2270,11 +2270,11 @@ define <vscale x 1 x i1> @fcmp_ugt_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ugt_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2294,10 +2294,10 @@ define <vscale x 1 x i1> @fcmp_uge_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_uge_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2316,11 +2316,11 @@ define <vscale x 1 x i1> @fcmp_uge_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_uge_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2341,11 +2341,11 @@ define <vscale x 1 x i1> @fcmp_uge_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_uge_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2365,10 +2365,10 @@ define <vscale x 1 x i1> @fcmp_ult_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ult_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2387,11 +2387,11 @@ define <vscale x 1 x i1> @fcmp_ult_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ult_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2412,11 +2412,11 @@ define <vscale x 1 x i1> @fcmp_ult_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ult_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2436,10 +2436,10 @@ define <vscale x 1 x i1> @fcmp_ule_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ule_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2458,11 +2458,11 @@ define <vscale x 1 x i1> @fcmp_ule_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ule_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2483,11 +2483,11 @@ define <vscale x 1 x i1> @fcmp_ule_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ule_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2506,10 +2506,10 @@ define <vscale x 1 x i1> @fcmp_une_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_une_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v0, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, metadata !"une", <vscale x 1 x i1> %m, i32 %evl)
@@ -2526,11 +2526,11 @@ define <vscale x 1 x i1> @fcmp_une_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_une_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v0, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -2549,11 +2549,11 @@ define <vscale x 1 x i1> @fcmp_une_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_une_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -2573,10 +2573,10 @@ define <vscale x 1 x i1> @fcmp_uno_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_uno_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vmfne.vv v9, v9, v9, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -2599,14 +2599,14 @@ define <vscale x 1 x i1> @fcmp_uno_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_uno_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v9, v9, v9, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2630,14 +2630,14 @@ define <vscale x 1 x i1> @fcmp_uno_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_uno_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v9, v9, v9, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -2658,10 +2658,10 @@ define <vscale x 3 x i1> @fcmp_oeq_vv_nxv3f16(<vscale x 3 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_oeq_vv_nxv3f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2681,10 +2681,10 @@ define <vscale x 8 x i1> @fcmp_oeq_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_oeq_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2703,11 +2703,11 @@ define <vscale x 8 x i1> @fcmp_oeq_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_oeq_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2728,11 +2728,11 @@ define <vscale x 8 x i1> @fcmp_oeq_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_oeq_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2752,10 +2752,10 @@ define <vscale x 8 x i1> @fcmp_ogt_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ogt_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2774,11 +2774,11 @@ define <vscale x 8 x i1> @fcmp_ogt_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ogt_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2799,11 +2799,11 @@ define <vscale x 8 x i1> @fcmp_ogt_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ogt_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2823,10 +2823,10 @@ define <vscale x 8 x i1> @fcmp_oge_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_oge_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2845,11 +2845,11 @@ define <vscale x 8 x i1> @fcmp_oge_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_oge_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2870,11 +2870,11 @@ define <vscale x 8 x i1> @fcmp_oge_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_oge_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2894,10 +2894,10 @@ define <vscale x 8 x i1> @fcmp_olt_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_olt_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2916,11 +2916,11 @@ define <vscale x 8 x i1> @fcmp_olt_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_olt_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2941,11 +2941,11 @@ define <vscale x 8 x i1> @fcmp_olt_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_olt_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2965,10 +2965,10 @@ define <vscale x 8 x i1> @fcmp_ole_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ole_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2987,11 +2987,11 @@ define <vscale x 8 x i1> @fcmp_ole_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ole_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3012,11 +3012,11 @@ define <vscale x 8 x i1> @fcmp_ole_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ole_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3037,10 +3037,10 @@ define <vscale x 8 x i1> @fcmp_one_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_one_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -3061,11 +3061,11 @@ define <vscale x 8 x i1> @fcmp_one_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_one_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -3088,11 +3088,11 @@ define <vscale x 8 x i1> @fcmp_one_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_one_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -3114,13 +3114,13 @@ define <vscale x 8 x i1> @fcmp_ord_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ord_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v10, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vmand.mm v0, v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -3142,14 +3142,14 @@ define <vscale x 8 x i1> @fcmp_ord_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ord_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v10, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vmand.mm v0, v10, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3173,14 +3173,14 @@ define <vscale x 8 x i1> @fcmp_ord_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ord_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v10, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vmand.mm v0, v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -3201,10 +3201,10 @@ define <vscale x 8 x i1> @fcmp_ueq_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ueq_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
@@ -3225,11 +3225,11 @@ define <vscale x 8 x i1> @fcmp_ueq_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ueq_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
@@ -3252,11 +3252,11 @@ define <vscale x 8 x i1> @fcmp_ueq_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ueq_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
@@ -3277,10 +3277,10 @@ define <vscale x 8 x i1> @fcmp_ugt_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ugt_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3299,11 +3299,11 @@ define <vscale x 8 x i1> @fcmp_ugt_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ugt_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3324,11 +3324,11 @@ define <vscale x 8 x i1> @fcmp_ugt_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ugt_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3348,10 +3348,10 @@ define <vscale x 8 x i1> @fcmp_uge_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_uge_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3370,11 +3370,11 @@ define <vscale x 8 x i1> @fcmp_uge_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_uge_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3395,11 +3395,11 @@ define <vscale x 8 x i1> @fcmp_uge_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_uge_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3419,10 +3419,10 @@ define <vscale x 8 x i1> @fcmp_ult_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ult_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3441,11 +3441,11 @@ define <vscale x 8 x i1> @fcmp_ult_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ult_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3466,11 +3466,11 @@ define <vscale x 8 x i1> @fcmp_ult_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ult_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3490,10 +3490,10 @@ define <vscale x 8 x i1> @fcmp_ule_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ule_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3512,11 +3512,11 @@ define <vscale x 8 x i1> @fcmp_ule_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ule_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3537,11 +3537,11 @@ define <vscale x 8 x i1> @fcmp_ule_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ule_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3561,10 +3561,10 @@ define <vscale x 8 x i1> @fcmp_une_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_une_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3583,11 +3583,11 @@ define <vscale x 8 x i1> @fcmp_une_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_une_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3608,11 +3608,11 @@ define <vscale x 8 x i1> @fcmp_une_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_une_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3633,13 +3633,13 @@ define <vscale x 8 x i1> @fcmp_uno_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_uno_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v10, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -3661,14 +3661,14 @@ define <vscale x 8 x i1> @fcmp_uno_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_uno_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v10, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v10, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3692,14 +3692,14 @@ define <vscale x 8 x i1> @fcmp_uno_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_uno_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v10, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -3829,14 +3829,14 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a7, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv8r.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli t0, a0, 4
-; ZVFHMIN-NEXT:    add a0, t0, a0
+; ZVFHMIN-NEXT:    slli a7, a0, 4
+; ZVFHMIN-NEXT:    add a0, a7, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
@@ -3844,7 +3844,7 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v24
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a7, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v6, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    bltu a6, a4, .LBB171_2
 ; ZVFHMIN-NEXT:  # %bb.1:
@@ -3857,16 +3857,16 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a6, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a7, a0, 4
-; ZVFHMIN-NEXT:    add a0, a7, a0
+; ZVFHMIN-NEXT:    slli a6, a0, 4
+; ZVFHMIN-NEXT:    add a0, a6, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    vsetvli zero, a6, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v5, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    add a0, a3, a3
 ; ZVFHMIN-NEXT:    bltu a2, a5, .LBB171_4
@@ -3881,6 +3881,9 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vl1r.v v7, (a6) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a6, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    sltu a6, a2, a5
+; ZVFHMIN-NEXT:    addi a6, a6, -1
+; ZVFHMIN-NEXT:    and a5, a6, a5
 ; ZVFHMIN-NEXT:    csrr a6, vlenb
 ; ZVFHMIN-NEXT:    mv a7, a6
 ; ZVFHMIN-NEXT:    slli a6, a6, 3
@@ -3890,31 +3893,28 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    add a6, sp, a6
 ; ZVFHMIN-NEXT:    addi a6, a6, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a6, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a5, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    slli a7, a6, 4
-; ZVFHMIN-NEXT:    add a6, a7, a6
-; ZVFHMIN-NEXT:    add a6, sp, a6
-; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    slli a7, a6, 5
-; ZVFHMIN-NEXT:    add a6, a7, a6
-; ZVFHMIN-NEXT:    add a6, sp, a6
-; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a6) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a5, vlenb
+; ZVFHMIN-NEXT:    slli a6, a5, 4
+; ZVFHMIN-NEXT:    add a5, a6, a5
+; ZVFHMIN-NEXT:    add a5, sp, a5
+; ZVFHMIN-NEXT:    addi a5, a5, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a5, vlenb
+; ZVFHMIN-NEXT:    slli a6, a5, 5
+; ZVFHMIN-NEXT:    add a5, a6, a5
+; ZVFHMIN-NEXT:    add a5, sp, a5
+; ZVFHMIN-NEXT:    addi a5, a5, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
-; ZVFHMIN-NEXT:    sltu a6, a2, a5
-; ZVFHMIN-NEXT:    addi a6, a6, -1
-; ZVFHMIN-NEXT:    and a5, a6, a5
-; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    slli a7, a6, 4
-; ZVFHMIN-NEXT:    add a6, a7, a6
-; ZVFHMIN-NEXT:    add a6, sp, a6
-; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a6) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a5, vlenb
+; ZVFHMIN-NEXT:    slli a6, a5, 4
+; ZVFHMIN-NEXT:    add a5, a6, a5
+; ZVFHMIN-NEXT:    add a5, sp, a5
+; ZVFHMIN-NEXT:    addi a5, a5, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v4, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslideup.vx v5, v6, a3
@@ -3923,16 +3923,16 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    mv a2, a4
 ; ZVFHMIN-NEXT:  .LBB171_6:
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a5, a4, 5
-; ZVFHMIN-NEXT:    add a4, a5, a4
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a4, a2, 5
+; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslideup.vx v8, v4, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
index 61cc754..9c733b1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
@@ -2367,9 +2367,8 @@ define <vscale x 1 x i1> @icmp_eq_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmseq.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2394,9 +2393,8 @@ define <vscale x 1 x i1> @icmp_eq_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %b
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmseq.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2451,9 +2449,8 @@ define <vscale x 1 x i1> @icmp_ne_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsne.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2478,9 +2475,8 @@ define <vscale x 1 x i1> @icmp_ne_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %b
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsne.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2535,9 +2531,8 @@ define <vscale x 1 x i1> @icmp_ugt_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2562,9 +2557,8 @@ define <vscale x 1 x i1> @icmp_ugt_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2619,9 +2613,8 @@ define <vscale x 1 x i1> @icmp_uge_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsleu.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2647,9 +2640,8 @@ define <vscale x 1 x i1> @icmp_uge_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsleu.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2704,9 +2696,8 @@ define <vscale x 1 x i1> @icmp_ult_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2731,9 +2722,8 @@ define <vscale x 1 x i1> @icmp_ult_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2788,9 +2778,8 @@ define <vscale x 1 x i1> @icmp_sgt_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmslt.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2815,9 +2804,8 @@ define <vscale x 1 x i1> @icmp_sgt_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmslt.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2872,9 +2860,8 @@ define <vscale x 1 x i1> @icmp_sge_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsle.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2900,9 +2887,8 @@ define <vscale x 1 x i1> @icmp_sge_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsle.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2957,9 +2943,8 @@ define <vscale x 1 x i1> @icmp_slt_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmslt.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2984,9 +2969,8 @@ define <vscale x 1 x i1> @icmp_slt_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmslt.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -3041,9 +3025,8 @@ define <vscale x 1 x i1> @icmp_sle_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsle.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -3068,9 +3051,8 @@ define <vscale x 1 x i1> @icmp_sle_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsle.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -3129,9 +3111,8 @@ define <vscale x 8 x i1> @icmp_eq_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmseq.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3158,9 +3139,8 @@ define <vscale x 8 x i1> @icmp_eq_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %b
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmseq.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3220,9 +3200,8 @@ define <vscale x 8 x i1> @icmp_ne_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsne.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3249,9 +3228,8 @@ define <vscale x 8 x i1> @icmp_ne_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %b
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsne.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3311,9 +3289,8 @@ define <vscale x 8 x i1> @icmp_ugt_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3340,9 +3317,8 @@ define <vscale x 8 x i1> @icmp_ugt_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3402,9 +3378,8 @@ define <vscale x 8 x i1> @icmp_uge_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsleu.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3432,9 +3407,8 @@ define <vscale x 8 x i1> @icmp_uge_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsleu.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3494,9 +3468,8 @@ define <vscale x 8 x i1> @icmp_ult_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3523,9 +3496,8 @@ define <vscale x 8 x i1> @icmp_ult_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3585,9 +3557,8 @@ define <vscale x 8 x i1> @icmp_sgt_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmslt.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3614,9 +3585,8 @@ define <vscale x 8 x i1> @icmp_sgt_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmslt.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3676,9 +3646,8 @@ define <vscale x 8 x i1> @icmp_sge_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsle.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3706,9 +3675,8 @@ define <vscale x 8 x i1> @icmp_sge_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsle.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3768,9 +3736,8 @@ define <vscale x 8 x i1> @icmp_slt_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmslt.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3797,9 +3764,8 @@ define <vscale x 8 x i1> @icmp_slt_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmslt.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3859,9 +3825,8 @@ define <vscale x 8 x i1> @icmp_sle_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsle.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3888,9 +3853,8 @@ define <vscale x 8 x i1> @icmp_sle_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsle.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
index fee6799..77f3cf3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
@@ -1487,9 +1487,8 @@ define <vscale x 1 x i64> @vadd_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1514,9 +1513,8 @@ define <vscale x 1 x i64> @vadd_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1583,9 +1581,8 @@ define <vscale x 2 x i64> @vadd_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1610,9 +1607,8 @@ define <vscale x 2 x i64> @vadd_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1679,9 +1675,8 @@ define <vscale x 4 x i64> @vadd_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1706,9 +1701,8 @@ define <vscale x 4 x i64> @vadd_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1775,9 +1769,8 @@ define <vscale x 8 x i64> @vadd_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1802,9 +1795,8 @@ define <vscale x 8 x i64> @vadd_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vand-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vand-vp.ll
index b0c5a72..4866bb0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vand-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vand-vp.ll
@@ -1314,9 +1314,8 @@ define <vscale x 1 x i64> @vand_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1341,9 +1340,8 @@ define <vscale x 1 x i64> @vand_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1410,9 +1408,8 @@ define <vscale x 2 x i64> @vand_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1437,9 +1434,8 @@ define <vscale x 2 x i64> @vand_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1506,9 +1502,8 @@ define <vscale x 4 x i64> @vand_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1533,9 +1528,8 @@ define <vscale x 4 x i64> @vand_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1602,9 +1596,8 @@ define <vscale x 8 x i64> @vand_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1629,9 +1622,8 @@ define <vscale x 8 x i64> @vand_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll
index 32992301..763b290 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll
@@ -1115,9 +1115,8 @@ define <vscale x 1 x i64> @vandn_vx_vp_nxv1i64(i64 %a, <vscale x 1 x i64> %b, <v
 ; CHECK-RV32-NEXT:    sw a0, 8(sp)
 ; CHECK-RV32-NEXT:    sw a1, 12(sp)
 ; CHECK-RV32-NEXT:    addi a0, sp, 8
-; CHECK-RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vlse64.v v9, (a0), zero
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; CHECK-RV32-NEXT:    vlse64.v v9, (a0), zero
 ; CHECK-RV32-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
 ; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1139,9 +1138,8 @@ define <vscale x 1 x i64> @vandn_vx_vp_nxv1i64(i64 %a, <vscale x 1 x i64> %b, <v
 ; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
 ; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
 ; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-ZVKB32-NEXT:    vlse64.v v9, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v9, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
 ; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 0
@@ -1208,9 +1206,8 @@ define <vscale x 2 x i64> @vandn_vx_vp_nxv2i64(i64 %a, <vscale x 2 x i64> %b, <v
 ; CHECK-RV32-NEXT:    sw a0, 8(sp)
 ; CHECK-RV32-NEXT:    sw a1, 12(sp)
 ; CHECK-RV32-NEXT:    addi a0, sp, 8
-; CHECK-RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vlse64.v v10, (a0), zero
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; CHECK-RV32-NEXT:    vlse64.v v10, (a0), zero
 ; CHECK-RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
 ; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1232,9 +1229,8 @@ define <vscale x 2 x i64> @vandn_vx_vp_nxv2i64(i64 %a, <vscale x 2 x i64> %b, <v
 ; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
 ; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
 ; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; CHECK-ZVKB32-NEXT:    vlse64.v v10, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v10, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
 ; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 0
@@ -1301,9 +1297,8 @@ define <vscale x 4 x i64> @vandn_vx_vp_nxv4i64(i64 %a, <vscale x 4 x i64> %b, <v
 ; CHECK-RV32-NEXT:    sw a0, 8(sp)
 ; CHECK-RV32-NEXT:    sw a1, 12(sp)
 ; CHECK-RV32-NEXT:    addi a0, sp, 8
-; CHECK-RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vlse64.v v12, (a0), zero
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; CHECK-RV32-NEXT:    vlse64.v v12, (a0), zero
 ; CHECK-RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
 ; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1325,9 +1320,8 @@ define <vscale x 4 x i64> @vandn_vx_vp_nxv4i64(i64 %a, <vscale x 4 x i64> %b, <v
 ; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
 ; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
 ; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; CHECK-ZVKB32-NEXT:    vlse64.v v12, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v12, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
 ; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 0
@@ -1394,9 +1388,8 @@ define <vscale x 8 x i64> @vandn_vx_vp_nxv8i64(i64 %a, <vscale x 8 x i64> %b, <v
 ; CHECK-RV32-NEXT:    sw a0, 8(sp)
 ; CHECK-RV32-NEXT:    sw a1, 12(sp)
 ; CHECK-RV32-NEXT:    addi a0, sp, 8
-; CHECK-RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; CHECK-RV32-NEXT:    vlse64.v v16, (a0), zero
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-RV32-NEXT:    vlse64.v v16, (a0), zero
 ; CHECK-RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
 ; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1418,9 +1411,8 @@ define <vscale x 8 x i64> @vandn_vx_vp_nxv8i64(i64 %a, <vscale x 8 x i64> %b, <v
 ; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
 ; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
 ; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; CHECK-ZVKB32-NEXT:    vlse64.v v16, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v16, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
 ; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll
index 2814be2..03e4e1f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll
@@ -893,9 +893,8 @@ define <vscale x 1 x i64> @vdiv_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -920,9 +919,8 @@ define <vscale x 1 x i64> @vdiv_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -969,9 +967,8 @@ define <vscale x 2 x i64> @vdiv_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -996,9 +993,8 @@ define <vscale x 2 x i64> @vdiv_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1045,9 +1041,8 @@ define <vscale x 4 x i64> @vdiv_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1072,9 +1067,8 @@ define <vscale x 4 x i64> @vdiv_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1121,9 +1115,8 @@ define <vscale x 8 x i64> @vdiv_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1148,9 +1141,8 @@ define <vscale x 8 x i64> @vdiv_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll
index 3e913d4..2f35f91 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll
@@ -892,9 +892,8 @@ define <vscale x 1 x i64> @vdivu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -919,9 +918,8 @@ define <vscale x 1 x i64> @vdivu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -968,9 +966,8 @@ define <vscale x 2 x i64> @vdivu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -995,9 +992,8 @@ define <vscale x 2 x i64> @vdivu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1044,9 +1040,8 @@ define <vscale x 4 x i64> @vdivu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1071,9 +1066,8 @@ define <vscale x 4 x i64> @vdivu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1120,9 +1114,8 @@ define <vscale x 8 x i64> @vdivu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1147,9 +1140,8 @@ define <vscale x 8 x i64> @vdivu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
index 87bc9f2..31359c3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
@@ -679,10 +679,10 @@ define <vscale x 1 x half> @vfadd_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -700,10 +700,10 @@ define <vscale x 1 x half> @vfadd_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -722,11 +722,11 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16(<vscale x 1 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfadd_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -747,11 +747,11 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16_commute(<vscale x 1 x half> %va, ha
 ; ZVFHMIN-LABEL: vfadd_vf_nxv1f16_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -772,11 +772,11 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16_unmasked(<vscale x 1 x half> %va, h
 ; ZVFHMIN-LABEL: vfadd_vf_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -797,11 +797,11 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16_unmasked_commute(<vscale x 1 x half
 ; ZVFHMIN-LABEL: vfadd_vf_nxv1f16_unmasked_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -823,10 +823,10 @@ define <vscale x 2 x half> @vfadd_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -844,10 +844,10 @@ define <vscale x 2 x half> @vfadd_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -866,11 +866,11 @@ define <vscale x 2 x half> @vfadd_vf_nxv2f16(<vscale x 2 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfadd_vf_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -891,11 +891,11 @@ define <vscale x 2 x half> @vfadd_vf_nxv2f16_unmasked(<vscale x 2 x half> %va, h
 ; ZVFHMIN-LABEL: vfadd_vf_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -917,10 +917,10 @@ define <vscale x 4 x half> @vfadd_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v10, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -938,10 +938,10 @@ define <vscale x 4 x half> @vfadd_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v10, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -960,11 +960,11 @@ define <vscale x 4 x half> @vfadd_vf_nxv4f16(<vscale x 4 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfadd_vf_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v10, v10, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -985,11 +985,11 @@ define <vscale x 4 x half> @vfadd_vf_nxv4f16_unmasked(<vscale x 4 x half> %va, h
 ; ZVFHMIN-LABEL: vfadd_vf_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v10, v10, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -1011,10 +1011,10 @@ define <vscale x 8 x half> @vfadd_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v12, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -1032,10 +1032,10 @@ define <vscale x 8 x half> @vfadd_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v12, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -1054,11 +1054,11 @@ define <vscale x 8 x half> @vfadd_vf_nxv8f16(<vscale x 8 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfadd_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v12, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -1079,11 +1079,11 @@ define <vscale x 8 x half> @vfadd_vf_nxv8f16_unmasked(<vscale x 8 x half> %va, h
 ; ZVFHMIN-LABEL: vfadd_vf_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v12, v12, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -1105,10 +1105,10 @@ define <vscale x 16 x half> @vfadd_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1126,10 +1126,10 @@ define <vscale x 16 x half> @vfadd_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1148,11 +1148,11 @@ define <vscale x 16 x half> @vfadd_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN-LABEL: vfadd_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1173,11 +1173,11 @@ define <vscale x 16 x half> @vfadd_vf_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-LABEL: vfadd_vf_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1205,23 +1205,22 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1231,10 +1230,11 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  .LBB48_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1266,22 +1266,21 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1291,9 +1290,10 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB49_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1324,14 +1324,10 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v24, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a1, 3
@@ -1352,15 +1348,18 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 3
-; ZVFHMIN-NEXT:    add a3, a4, a3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1370,20 +1369,21 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:  .LBB50_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1416,16 +1416,10 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
@@ -1436,18 +1430,22 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1457,14 +1455,15 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB51_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
index 061af45..2205769 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
@@ -641,10 +641,10 @@ define <vscale x 1 x half> @vfdiv_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -662,10 +662,10 @@ define <vscale x 1 x half> @vfdiv_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -684,11 +684,11 @@ define <vscale x 1 x half> @vfdiv_vf_nxv1f16(<vscale x 1 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -709,11 +709,11 @@ define <vscale x 1 x half> @vfdiv_vf_nxv1f16_unmasked(<vscale x 1 x half> %va, h
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -735,10 +735,10 @@ define <vscale x 2 x half> @vfdiv_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -756,10 +756,10 @@ define <vscale x 2 x half> @vfdiv_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -778,11 +778,11 @@ define <vscale x 2 x half> @vfdiv_vf_nxv2f16(<vscale x 2 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -803,11 +803,11 @@ define <vscale x 2 x half> @vfdiv_vf_nxv2f16_unmasked(<vscale x 2 x half> %va, h
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -829,10 +829,10 @@ define <vscale x 4 x half> @vfdiv_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v10, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -850,10 +850,10 @@ define <vscale x 4 x half> @vfdiv_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v10, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -872,11 +872,11 @@ define <vscale x 4 x half> @vfdiv_vf_nxv4f16(<vscale x 4 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v10, v10, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -897,11 +897,11 @@ define <vscale x 4 x half> @vfdiv_vf_nxv4f16_unmasked(<vscale x 4 x half> %va, h
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v10, v10, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -923,10 +923,10 @@ define <vscale x 8 x half> @vfdiv_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v12, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -944,10 +944,10 @@ define <vscale x 8 x half> @vfdiv_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v12, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -966,11 +966,11 @@ define <vscale x 8 x half> @vfdiv_vf_nxv8f16(<vscale x 8 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v12, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -991,11 +991,11 @@ define <vscale x 8 x half> @vfdiv_vf_nxv8f16_unmasked(<vscale x 8 x half> %va, h
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v12, v12, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -1017,10 +1017,10 @@ define <vscale x 16 x half> @vfdiv_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1038,10 +1038,10 @@ define <vscale x 16 x half> @vfdiv_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1060,11 +1060,11 @@ define <vscale x 16 x half> @vfdiv_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1085,11 +1085,11 @@ define <vscale x 16 x half> @vfdiv_vf_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1117,23 +1117,22 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1143,10 +1142,11 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  .LBB44_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1178,22 +1178,21 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1203,9 +1202,10 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB45_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1236,14 +1236,10 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v24, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a1, 3
@@ -1264,15 +1260,18 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 3
-; ZVFHMIN-NEXT:    add a3, a4, a3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1282,20 +1281,21 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:  .LBB46_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1328,16 +1328,10 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
@@ -1348,18 +1342,22 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1369,14 +1367,15 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB47_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
index 02d6229..5d998c4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
@@ -290,10 +290,10 @@ define <vscale x 1 x half> @vfmax_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -311,10 +311,10 @@ define <vscale x 1 x half> @vfmax_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -334,10 +334,10 @@ define <vscale x 2 x half> @vfmax_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -355,10 +355,10 @@ define <vscale x 2 x half> @vfmax_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -378,10 +378,10 @@ define <vscale x 4 x half> @vfmax_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v10, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -399,10 +399,10 @@ define <vscale x 4 x half> @vfmax_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v10, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -422,10 +422,10 @@ define <vscale x 8 x half> @vfmax_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v12, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -443,10 +443,10 @@ define <vscale x 8 x half> @vfmax_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v12, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -466,10 +466,10 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -487,10 +487,10 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -516,23 +516,22 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -542,10 +541,11 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  .LBB22_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -577,22 +577,21 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -602,9 +601,10 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB23_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
index f7f8029..48a4c13 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
@@ -290,10 +290,10 @@ define <vscale x 1 x half> @vfmin_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -311,10 +311,10 @@ define <vscale x 1 x half> @vfmin_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -334,10 +334,10 @@ define <vscale x 2 x half> @vfmin_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -355,10 +355,10 @@ define <vscale x 2 x half> @vfmin_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -378,10 +378,10 @@ define <vscale x 4 x half> @vfmin_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v10, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -399,10 +399,10 @@ define <vscale x 4 x half> @vfmin_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v10, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -422,10 +422,10 @@ define <vscale x 8 x half> @vfmin_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v12, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -443,10 +443,10 @@ define <vscale x 8 x half> @vfmin_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v12, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -466,10 +466,10 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -487,10 +487,10 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -516,23 +516,22 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -542,10 +541,11 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  .LBB22_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -577,22 +577,21 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -602,9 +601,10 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB23_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
index 7e552304..06f74dd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
@@ -19,10 +19,10 @@ define <vscale x 1 x half> @vfmul_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -40,10 +40,10 @@ define <vscale x 1 x half> @vfmul_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -62,11 +62,11 @@ define <vscale x 1 x half> @vfmul_vf_nxv1f16(<vscale x 1 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfmul_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -87,11 +87,11 @@ define <vscale x 1 x half> @vfmul_vf_nxv1f16_unmasked(<vscale x 1 x half> %va, h
 ; ZVFHMIN-LABEL: vfmul_vf_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -113,10 +113,10 @@ define <vscale x 2 x half> @vfmul_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -134,10 +134,10 @@ define <vscale x 2 x half> @vfmul_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -156,11 +156,11 @@ define <vscale x 2 x half> @vfmul_vf_nxv2f16(<vscale x 2 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfmul_vf_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -181,11 +181,11 @@ define <vscale x 2 x half> @vfmul_vf_nxv2f16_unmasked(<vscale x 2 x half> %va, h
 ; ZVFHMIN-LABEL: vfmul_vf_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -207,10 +207,10 @@ define <vscale x 4 x half> @vfmul_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v10, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -228,10 +228,10 @@ define <vscale x 4 x half> @vfmul_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v10, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -250,11 +250,11 @@ define <vscale x 4 x half> @vfmul_vf_nxv4f16(<vscale x 4 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfmul_vf_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v10, v10, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -275,11 +275,11 @@ define <vscale x 4 x half> @vfmul_vf_nxv4f16_unmasked(<vscale x 4 x half> %va, h
 ; ZVFHMIN-LABEL: vfmul_vf_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v10, v10, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -301,10 +301,10 @@ define <vscale x 8 x half> @vfmul_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v12, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -322,10 +322,10 @@ define <vscale x 8 x half> @vfmul_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v12, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -344,11 +344,11 @@ define <vscale x 8 x half> @vfmul_vf_nxv8f16(<vscale x 8 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfmul_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v12, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -369,11 +369,11 @@ define <vscale x 8 x half> @vfmul_vf_nxv8f16_unmasked(<vscale x 8 x half> %va, h
 ; ZVFHMIN-LABEL: vfmul_vf_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v12, v12, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -395,10 +395,10 @@ define <vscale x 16 x half> @vfmul_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -416,10 +416,10 @@ define <vscale x 16 x half> @vfmul_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -438,11 +438,11 @@ define <vscale x 16 x half> @vfmul_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN-LABEL: vfmul_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -463,11 +463,11 @@ define <vscale x 16 x half> @vfmul_vf_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-LABEL: vfmul_vf_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -495,23 +495,22 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -521,10 +520,11 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  .LBB20_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -556,22 +556,21 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -581,9 +580,10 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB21_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -614,14 +614,10 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v24, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a1, 3
@@ -642,15 +638,18 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 3
-; ZVFHMIN-NEXT:    add a3, a4, a3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -660,20 +659,21 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:  .LBB22_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -706,16 +706,10 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
@@ -726,18 +720,22 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -747,14 +745,15 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB23_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll
index b7f2133..575d50d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll
@@ -42,9 +42,9 @@ define <vscale x 2 x i1> @vfptosi_nxv2i1_nxv2f16(<vscale x 2 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i1_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vmsne.vi v0, v8, 0, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -62,9 +62,9 @@ define <vscale x 2 x i1> @vfptosi_nxv2i1_nxv2f16_unmasked(<vscale x 2 x half> %v
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i1_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v9
 ; ZVFHMIN-NEXT:    vmsne.vi v0, v8, 0
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
index d990c74..e33ab98 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
@@ -130,9 +130,8 @@ define <vscale x 2 x i7> @vfptosi_v4i7_v4f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfptosi_v4i7_v4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vnsrl.wi v8, v8, 0, v0.t
@@ -153,9 +152,8 @@ define <vscale x 2 x i8> @vfptosi_nxv2i8_nxv2f16(<vscale x 2 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i8_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vnsrl.wi v8, v8, 0, v0.t
@@ -174,9 +172,8 @@ define <vscale x 2 x i8> @vfptosi_nxv2i8_nxv2f16_unmasked(<vscale x 2 x half> %v
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i8_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vnsrl.wi v8, v8, 0
@@ -196,9 +193,8 @@ define <vscale x 2 x i16> @vfptosi_nxv2i16_nxv2f16(<vscale x 2 x half> %va, <vsc
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i16_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -214,9 +210,8 @@ define <vscale x 2 x i16> @vfptosi_nxv2i16_nxv2f16_unmasked(<vscale x 2 x half>
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i16_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -235,9 +230,9 @@ define <vscale x 2 x i32> @vfptosi_nxv2i32_nxv2f16(<vscale x 2 x half> %va, <vsc
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i32_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -254,9 +249,9 @@ define <vscale x 2 x i32> @vfptosi_nxv2i32_nxv2f16_unmasked(<vscale x 2 x half>
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i32_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -276,9 +271,9 @@ define <vscale x 2 x i64> @vfptosi_nxv2i64_nxv2f16(<vscale x 2 x half> %va, <vsc
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i64_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.rtz.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -286,21 +281,13 @@ define <vscale x 2 x i64> @vfptosi_nxv2i64_nxv2f16(<vscale x 2 x half> %va, <vsc
 }
 
 define <vscale x 2 x i64> @vfptosi_nxv2i64_nxv2f16_unmasked(<vscale x 2 x half> %va, i32 zeroext %evl) {
-; ZVFH-LABEL: vfptosi_nxv2i64_nxv2f16_unmasked:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFH-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFH-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFH-NEXT:    vfwcvt.rtz.x.f.v v8, v10
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfptosi_nxv2i64_nxv2f16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.rtz.x.f.v v8, v10
-; ZVFHMIN-NEXT:    ret
+; CHECK-LABEL: vfptosi_nxv2i64_nxv2f16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v10
+; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 2 x i64> %v
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll
index 8ac5992..e1d0ad4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll
@@ -42,9 +42,9 @@ define <vscale x 2 x i1> @vfptoui_nxv2i1_nxv2f16(<vscale x 2 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i1_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.xu.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vmsne.vi v0, v8, 0, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -62,9 +62,9 @@ define <vscale x 2 x i1> @vfptoui_nxv2i1_nxv2f16_unmasked(<vscale x 2 x half> %v
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i1_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.xu.f.v v8, v9
 ; ZVFHMIN-NEXT:    vmsne.vi v0, v8, 0
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
index 3b24a64..86222ec 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
@@ -130,9 +130,8 @@ define <vscale x 2 x i7> @vfptoui_v4i7_v4f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfptoui_v4i7_v4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vnsrl.wi v8, v8, 0, v0.t
@@ -153,9 +152,8 @@ define <vscale x 2 x i8> @vfptoui_nxv2i8_nxv2f16(<vscale x 2 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i8_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vnsrl.wi v8, v8, 0, v0.t
@@ -174,9 +172,8 @@ define <vscale x 2 x i8> @vfptoui_nxv2i8_nxv2f16_unmasked(<vscale x 2 x half> %v
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i8_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vnsrl.wi v8, v8, 0
@@ -196,9 +193,8 @@ define <vscale x 2 x i16> @vfptoui_nxv2i16_nxv2f16(<vscale x 2 x half> %va, <vsc
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i16_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -214,9 +210,8 @@ define <vscale x 2 x i16> @vfptoui_nxv2i16_nxv2f16_unmasked(<vscale x 2 x half>
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i16_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -235,9 +230,9 @@ define <vscale x 2 x i32> @vfptoui_nxv2i32_nxv2f16(<vscale x 2 x half> %va, <vsc
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i32_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.xu.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -254,9 +249,9 @@ define <vscale x 2 x i32> @vfptoui_nxv2i32_nxv2f16_unmasked(<vscale x 2 x half>
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i32_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.xu.f.v v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -276,9 +271,9 @@ define <vscale x 2 x i64> @vfptoui_nxv2i64_nxv2f16(<vscale x 2 x half> %va, <vsc
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i64_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.rtz.xu.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -286,21 +281,13 @@ define <vscale x 2 x i64> @vfptoui_nxv2i64_nxv2f16(<vscale x 2 x half> %va, <vsc
 }
 
 define <vscale x 2 x i64> @vfptoui_nxv2i64_nxv2f16_unmasked(<vscale x 2 x half> %va, i32 zeroext %evl) {
-; ZVFH-LABEL: vfptoui_nxv2i64_nxv2f16_unmasked:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFH-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFH-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFH-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfptoui_nxv2i64_nxv2f16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
-; ZVFHMIN-NEXT:    ret
+; CHECK-LABEL: vfptoui_nxv2i64_nxv2f16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
+; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 2 x i64> %v
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
index 8e57be1..e94d0a6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
@@ -242,9 +242,9 @@ define <vscale x 1 x half> @vfsqrt_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v9, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -262,9 +262,9 @@ define <vscale x 1 x half> @vfsqrt_vv_nxv1f16_unmasked(<vscale x 1 x half> %va,
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v9, v9
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -284,9 +284,9 @@ define <vscale x 2 x half> @vfsqrt_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v9, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -304,9 +304,9 @@ define <vscale x 2 x half> @vfsqrt_vv_nxv2f16_unmasked(<vscale x 2 x half> %va,
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v9, v9
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -326,9 +326,9 @@ define <vscale x 4 x half> @vfsqrt_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -346,9 +346,9 @@ define <vscale x 4 x half> @vfsqrt_vv_nxv4f16_unmasked(<vscale x 4 x half> %va,
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v10, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -368,9 +368,9 @@ define <vscale x 8 x half> @vfsqrt_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -388,9 +388,9 @@ define <vscale x 8 x half> @vfsqrt_vv_nxv8f16_unmasked(<vscale x 8 x half> %va,
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v12, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -410,9 +410,9 @@ define <vscale x 16 x half> @vfsqrt_vv_nxv16f16(<vscale x 16 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -430,9 +430,9 @@ define <vscale x 16 x half> @vfsqrt_vv_nxv16f16_unmasked(<vscale x 16 x half> %v
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v16, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -458,13 +458,13 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
@@ -472,9 +472,10 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -498,14 +499,14 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -513,8 +514,9 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v16, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
index d034f65..56ed560 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
@@ -641,10 +641,10 @@ define <vscale x 1 x half> @vfsub_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -662,10 +662,10 @@ define <vscale x 1 x half> @vfsub_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -684,11 +684,11 @@ define <vscale x 1 x half> @vfsub_vf_nxv1f16(<vscale x 1 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfsub_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -709,11 +709,11 @@ define <vscale x 1 x half> @vfsub_vf_nxv1f16_unmasked(<vscale x 1 x half> %va, h
 ; ZVFHMIN-LABEL: vfsub_vf_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -735,10 +735,10 @@ define <vscale x 2 x half> @vfsub_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -756,10 +756,10 @@ define <vscale x 2 x half> @vfsub_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -778,11 +778,11 @@ define <vscale x 2 x half> @vfsub_vf_nxv2f16(<vscale x 2 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfsub_vf_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -803,11 +803,11 @@ define <vscale x 2 x half> @vfsub_vf_nxv2f16_unmasked(<vscale x 2 x half> %va, h
 ; ZVFHMIN-LABEL: vfsub_vf_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -829,10 +829,10 @@ define <vscale x 4 x half> @vfsub_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v10, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -850,10 +850,10 @@ define <vscale x 4 x half> @vfsub_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v10, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -872,11 +872,11 @@ define <vscale x 4 x half> @vfsub_vf_nxv4f16(<vscale x 4 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfsub_vf_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v10, v10, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -897,11 +897,11 @@ define <vscale x 4 x half> @vfsub_vf_nxv4f16_unmasked(<vscale x 4 x half> %va, h
 ; ZVFHMIN-LABEL: vfsub_vf_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v10, v10, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -923,10 +923,10 @@ define <vscale x 8 x half> @vfsub_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v12, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -944,10 +944,10 @@ define <vscale x 8 x half> @vfsub_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v12, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -966,11 +966,11 @@ define <vscale x 8 x half> @vfsub_vf_nxv8f16(<vscale x 8 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfsub_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v12, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -991,11 +991,11 @@ define <vscale x 8 x half> @vfsub_vf_nxv8f16_unmasked(<vscale x 8 x half> %va, h
 ; ZVFHMIN-LABEL: vfsub_vf_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v12, v12, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -1017,10 +1017,10 @@ define <vscale x 16 x half> @vfsub_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1038,10 +1038,10 @@ define <vscale x 16 x half> @vfsub_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1060,11 +1060,11 @@ define <vscale x 16 x half> @vfsub_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN-LABEL: vfsub_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1085,11 +1085,11 @@ define <vscale x 16 x half> @vfsub_vf_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-LABEL: vfsub_vf_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1117,23 +1117,22 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1143,10 +1142,11 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  .LBB44_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1178,22 +1178,21 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1203,9 +1202,10 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB45_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1236,14 +1236,10 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v24, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a1, 3
@@ -1264,15 +1260,18 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 3
-; ZVFHMIN-NEXT:    add a3, a4, a3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1282,20 +1281,21 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:  .LBB46_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1328,16 +1328,10 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
@@ -1348,18 +1342,22 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1369,14 +1367,15 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB47_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll
index 6cd3884..a1d548e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll
@@ -143,9 +143,8 @@ define <vscale x 1 x float> @vfmacc_vf_nxv1f32(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv1f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -170,9 +169,8 @@ define <vscale x 1 x float> @vfmacc_vf_nxv1f32_commute(<vscale x 1 x half> %va,
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv1f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -198,9 +196,8 @@ define <vscale x 1 x float> @vfmacc_vf_nxv1f32_unmasked(<vscale x 1 x half> %va,
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv1f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -225,9 +222,8 @@ define <vscale x 1 x float> @vfmacc_vf_nxv1f32_tu(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv1f32_tu:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, tu, mu
@@ -254,9 +250,8 @@ define <vscale x 1 x float> @vfmacc_vf_nxv1f32_commute_tu(<vscale x 1 x half> %v
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv1f32_commute_tu:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, tu, mu
@@ -283,9 +278,8 @@ define <vscale x 1 x float> @vfmacc_vf_nxv1f32_unmasked_tu(<vscale x 1 x half> %
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv1f32_unmasked_tu:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, tu, ma
@@ -362,9 +356,8 @@ define <vscale x 2 x float> @vfmacc_vf_nxv2f32(<vscale x 2 x half> %va, half %b,
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv2f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -389,9 +382,8 @@ define <vscale x 2 x float> @vfmacc_vf_nxv2f32_unmasked(<vscale x 2 x half> %va,
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv2f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -468,9 +460,8 @@ define <vscale x 4 x float> @vfmacc_vf_nxv4f32(<vscale x 4 x half> %va, half %b,
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv4f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -495,9 +486,8 @@ define <vscale x 4 x float> @vfmacc_vf_nxv4f32_unmasked(<vscale x 4 x half> %va,
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv4f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -574,9 +564,8 @@ define <vscale x 8 x float> @vfmacc_vf_nxv8f32(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv8f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -601,9 +590,8 @@ define <vscale x 8 x float> @vfmacc_vf_nxv8f32_unmasked(<vscale x 8 x half> %va,
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv8f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -696,9 +684,8 @@ define <vscale x 16 x float> @vfmacc_vf_nxv16f32(<vscale x 16 x half> %va, half
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv16f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -723,9 +710,8 @@ define <vscale x 16 x float> @vfmacc_vf_nxv16f32_unmasked(<vscale x 16 x half> %
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv16f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll
index c92a79e..94b8007 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll
@@ -120,9 +120,8 @@ define <vscale x 1 x float> @vmfsac_vf_nxv1f32(<vscale x 1 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv1f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -148,9 +147,8 @@ define <vscale x 1 x float> @vmfsac_vf_nxv1f32_commute(<vscale x 1 x half> %a, h
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv1f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -177,9 +175,8 @@ define <vscale x 1 x float> @vmfsac_vf_nxv1f32_unmasked(<vscale x 1 x half> %a,
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv1f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -255,9 +252,8 @@ define <vscale x 2 x float> @vmfsac_vf_nxv2f32(<vscale x 2 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv2f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -283,9 +279,8 @@ define <vscale x 2 x float> @vmfsac_vf_nxv2f32_commute(<vscale x 2 x half> %a, h
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv2f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -312,9 +307,8 @@ define <vscale x 2 x float> @vmfsac_vf_nxv2f32_unmasked(<vscale x 2 x half> %a,
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv2f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -392,9 +386,8 @@ define <vscale x 4 x float> @vmfsac_vf_nxv4f32(<vscale x 4 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv4f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -420,9 +413,8 @@ define <vscale x 4 x float> @vmfsac_vf_nxv4f32_commute(<vscale x 4 x half> %a, h
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv4f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -449,9 +441,8 @@ define <vscale x 4 x float> @vmfsac_vf_nxv4f32_unmasked(<vscale x 4 x half> %a,
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv4f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -529,9 +520,8 @@ define <vscale x 8 x float> @vmfsac_vf_nxv8f32(<vscale x 8 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv8f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -557,9 +547,8 @@ define <vscale x 8 x float> @vmfsac_vf_nxv8f32_commute(<vscale x 8 x half> %a, h
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv8f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -586,9 +575,8 @@ define <vscale x 8 x float> @vmfsac_vf_nxv8f32_unmasked(<vscale x 8 x half> %a,
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv8f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll
index 0a0bc66..ea45706 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll
@@ -71,9 +71,8 @@ define <vscale x 1 x float> @vfnmacc_vf_nxv1f32(<vscale x 1 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv1f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -101,9 +100,8 @@ define <vscale x 1 x float> @vfnmacc_vf_nxv1f32_commute(<vscale x 1 x half> %a,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv1f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -131,9 +129,8 @@ define <vscale x 1 x float> @vfnmacc_vf_nxv1f32_unmasked(<vscale x 1 x half> %a,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv1f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -212,9 +209,8 @@ define <vscale x 2 x float> @vfnmacc_vf_nxv2f32(<vscale x 2 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv2f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -242,9 +238,8 @@ define <vscale x 2 x float> @vfnmacc_vf_nxv2f32_commute(<vscale x 2 x half> %a,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv2f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -272,9 +267,8 @@ define <vscale x 2 x float> @vfnmacc_vf_nxv2f32_unmasked(<vscale x 2 x half> %a,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv2f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -355,9 +349,8 @@ define <vscale x 4 x float> @vfnmacc_vf_nxv4f32(<vscale x 4 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv4f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -385,9 +378,8 @@ define <vscale x 4 x float> @vfnmacc_vf_nxv4f32_commute(<vscale x 4 x half> %a,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv4f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -415,9 +407,8 @@ define <vscale x 4 x float> @vfnmacc_vf_nxv4f32_unmasked(<vscale x 4 x half> %a,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv4f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -498,9 +489,8 @@ define <vscale x 8 x float> @vfnmacc_vf_nxv8f32(<vscale x 8 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv8f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -528,9 +518,8 @@ define <vscale x 8 x float> @vfnmacc_vf_nxv8f32_commute(<vscale x 8 x half> %a,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv8f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -558,9 +547,8 @@ define <vscale x 8 x float> @vfnmacc_vf_nxv8f32_unmasked(<vscale x 8 x half> %a,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv8f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -657,9 +645,8 @@ define <vscale x 16 x float> @vfnmacc_vf_nxv16f32(<vscale x 16 x half> %a, half
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv16f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -687,9 +674,8 @@ define <vscale x 16 x float> @vfnmacc_vf_nxv16f32_commute(<vscale x 16 x half> %
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv16f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -717,9 +703,8 @@ define <vscale x 16 x float> @vfnmacc_vf_nxv16f32_unmasked(<vscale x 16 x half>
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv16f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll
index b5f7ef3..4956da5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll
@@ -69,9 +69,8 @@ define <vscale x 1 x float> @vfnmsac_vf_nxv1f32(<vscale x 1 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv1f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -97,9 +96,8 @@ define <vscale x 1 x float> @vfnmsac_vf_nxv1f32_commute(<vscale x 1 x half> %a,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv1f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -126,9 +124,8 @@ define <vscale x 1 x float> @vfnmsac_vf_nxv1f32_unmasked(<vscale x 1 x half> %a,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv1f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -204,9 +201,8 @@ define <vscale x 2 x float> @vfnmsac_vf_nxv2f32(<vscale x 2 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv2f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -232,9 +228,8 @@ define <vscale x 2 x float> @vfnmsac_vf_nxv2f32_commute(<vscale x 2 x half> %a,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv2f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -261,9 +256,8 @@ define <vscale x 2 x float> @vfnmsac_vf_nxv2f32_unmasked(<vscale x 2 x half> %a,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv2f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -341,9 +335,8 @@ define <vscale x 4 x float> @vfnmsac_vf_nxv4f32(<vscale x 4 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv4f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -369,9 +362,8 @@ define <vscale x 4 x float> @vfnmsac_vf_nxv4f32_commute(<vscale x 4 x half> %a,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv4f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -398,9 +390,8 @@ define <vscale x 4 x float> @vfnmsac_vf_nxv4f32_unmasked(<vscale x 4 x half> %a,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv4f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -478,9 +469,8 @@ define <vscale x 8 x float> @vfnmsac_vf_nxv8f32(<vscale x 8 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv8f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -506,9 +496,8 @@ define <vscale x 8 x float> @vfnmsac_vf_nxv8f32_commute(<vscale x 8 x half> %a,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv8f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -535,9 +524,8 @@ define <vscale x 8 x float> @vfnmsac_vf_nxv8f32_unmasked(<vscale x 8 x half> %a,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv8f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -631,9 +619,8 @@ define <vscale x 16 x float> @vfnmsac_vf_nxv16f32(<vscale x 16 x half> %a, half
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv16f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -659,9 +646,8 @@ define <vscale x 16 x float> @vfnmsac_vf_nxv16f32_commute(<vscale x 16 x half> %
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv16f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -688,9 +674,8 @@ define <vscale x 16 x float> @vfnmsac_vf_nxv16f32_unmasked(<vscale x 16 x half>
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv16f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir
index b304769b..c4a3834 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir
@@ -513,6 +513,36 @@ body: |
     %y:vr = PseudoVNSRL_WV_MF2 $noreg, $noreg, %x, 1, 3 /* e8 */, 0
 ...
 ---
+name: vfnop_vs2
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vfnop_vs2
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */
+    ; CHECK-NEXT: early-clobber %y:vr = PseudoVFNCVT_X_F_W_MF2 $noreg, %x, 0, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0
+    early-clobber %y:vr = PseudoVFNCVT_X_F_W_MF2 $noreg, %x, 0, 1, 3 /* e8 */, 0
+...
+---
+name: vfnop_vs2_incompatible_eew
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vfnop_vs2_incompatible_eew
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: early-clobber %y:vr = PseudoVFNCVT_X_F_W_MF2 $noreg, %x, 0, 1, 4 /* e16 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    early-clobber %y:vr = PseudoVFNCVT_X_F_W_MF2 $noreg, %x, 0, 1, 4 /* e16 */, 0
+...
+---
+name: vfnop_vs2_incompatible_emul
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vfnop_vs2_incompatible_emul
+    ; CHECK: %x:vr = PseudoVADD_VV_MF2 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */
+    ; CHECK-NEXT: early-clobber %y:vr = PseudoVFNCVT_X_F_W_MF2 $noreg, %x, 0, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_MF2 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0
+    early-clobber %y:vr = PseudoVFNCVT_X_F_W_MF2 $noreg, %x, 0, 1, 3 /* e8 */, 0
+...
+---
 name: vseN_v
 body: |
   bb.0:
@@ -543,6 +573,86 @@ body: |
     PseudoVSE8_V_MF2 %x, $noreg, 1, 3 /* e8 */
 ...
 ---
+name: vsm_v
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vsm_v
+    ; CHECK: %x:vr = PseudoVMAND_MM_B8 $noreg, $noreg, 1, 0 /* e8 */
+    ; CHECK-NEXT: PseudoVSM_V_B8 %x, $noreg, 1, 0 /* e8 */
+    %x:vr = PseudoVMAND_MM_B8 $noreg, $noreg, -1, 0
+    PseudoVSM_V_B8 %x, $noreg, 1, 0
+...
+---
+name: vsm_v_incompatible_emul
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vsm_v_incompatible_emul
+    ; CHECK: %x:vr = PseudoVMAND_MM_B8 $noreg, $noreg, -1, 0 /* e8 */
+    ; CHECK-NEXT: PseudoVSM_V_B16 %x, $noreg, 1, 0 /* e8 */
+    %x:vr = PseudoVMAND_MM_B8 $noreg, $noreg, -1, 0
+    PseudoVSM_V_B16 %x, $noreg, 1, 0
+...
+---
+name: vleN_v
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vleN_v
+    ; CHECK: %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
+---
+name: vleN_v_incompatible_eew
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vleN_v_incompatible_eew
+    ; CHECK: %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */
+    %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 4 /* e16 */, 0
+...
+---
+name: vleN_v_incompatible_emul
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vleN_v_incompatible_emul
+    ; CHECK: %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %x:vr = PseudoVADD_VV_MF2 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0
+    %x:vr = PseudoVADD_VV_MF2 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
+---
+name: vlm_v
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vlm_v
+    ; CHECK: %x:vr = PseudoVLM_V_B8 $noreg, $noreg, 1, 0 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVMAND_MM_B8 $noreg, %x, 1, 0 /* e8 */
+    %x:vr = PseudoVLM_V_B8 $noreg, $noreg, -1, 0, 0
+    %y:vr = PseudoVMAND_MM_B8 $noreg, %x, 1, 0
+...
+---
+name: vlm_v_incompatible_eew
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vlm_v_incompatible_eew
+    ; CHECK: %x:vr = PseudoVLM_V_B8 $noreg, $noreg, -1, 0 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, $noreg, %x, 1, 4 /* e16 */, 0 /* tu, mu */
+    %x:vr = PseudoVLM_V_B8 $noreg, $noreg, -1, 0, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, $noreg, %x, 1, 4 /* e16 */, 0
+...
+---
+name: vlm_v_incompatible_emul
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vlm_v_incompatible_emul
+    ; CHECK: %x:vr = PseudoVLM_V_B8 $noreg, $noreg, -1, 0 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVMAND_MM_B16 $noreg, %x, 1, 0 /* e8 */
+    %x:vr = PseudoVLM_V_B8 $noreg, $noreg, -1, 0, 0
+    %y:vr = PseudoVMAND_MM_B16 $noreg, %x, 1, 0
+...
+---
 name: vsseN_v
 body: |
   bb.0:
@@ -675,6 +785,56 @@ body: |
     %y:vr = PseudoVLUXEI8_V_MF2_M1 $noreg, $noreg, %x, 1, 4 /* e16 */, 0
 ...
 ---
+name: vluxeiN_v_idx_incompatible_eew
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vluxeiN_v_idx_incompatible_eew
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0
+    %y:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, %x, 1, 3 /* e8 */, 0
+...
+---
+name: vluxeiN_v_idx_incompatible_emul
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vluxeiN_v_idx_incompatible_emul
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVLUXEI8_V_MF2_MF2 $noreg, $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVLUXEI8_V_MF2_MF2 $noreg, $noreg, %x, 1, 3 /* e8 */, 0
+...
+---
+name: vluxeiN_v_vd
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vluxeiN_v_vd
+    ; CHECK: %x:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
+---
+name: vluxeiN_v_vd_incompatible_eew
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vluxeiN_v_vd_incompatible_eew
+    ; CHECK: %x:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */
+    %x:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 4 /* e16 */, 0
+...
+---
+name: vluxeiN_vd_incompatible_emul
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vluxeiN_vd_incompatible_emul
+    ; CHECK: %x:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_MF2 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVADD_VV_MF2 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
+---
 name: vmop_mm
 body: |
   bb.0:
@@ -1064,3 +1224,116 @@ body: |
     %x:vr = PseudoVMAND_MM_B1 $noreg, $noreg, -1, 0
     %y:vr = PseudoVIOTA_M_MF2 $noreg, %x, 1, 3 /* e8 */, 0
 ...
+name: vred_vs2
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vs2
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
+---
+name: vred_vs1
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vs1
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, $noreg, %x, 1, 3 /* e8 */, 0
+...
+---
+name: vred_vs1_vs2
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vs1_vs2
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, %x, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, %x, %x, 1, 3 /* e8 */, 0
+...
+---
+name: vred_vs1_vs2_incompatible_eew
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vs1_vs2_incompatible_eew
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, %x, %x, 1, 4 /* e16 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, %x, %x, 1, 4 /* e16 */, 0
+...
+---
+name: vred_vs1_vs2_incompatible_emul
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vs1_vs2_incompatible_emul
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDAND_VS_MF2_E8 $noreg, %x, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDAND_VS_MF2_E8 $noreg, %x, %x, 1, 3 /* e8 */, 0
+...
+---
+name: vred_other_user_is_vl0
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_other_user_is_vl0
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 0, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 1, 3 /* e8 */, 0
+    %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 0, 3 /* e8 */, 0
+...
+---
+name: vred_both_vl0
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_both_vl0
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 0, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 0, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 0, 3 /* e8 */, 0
+    %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 0, 3 /* e8 */, 0
+...
+---
+name: vred_vl0_and_vlreg
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vl0_and_vlreg
+    ; CHECK: %vl:gprnox0 = COPY $x1
+    ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, %vl, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 0, 3 /* e8 */, 0 /* tu, mu */
+    %vl:gprnox0 = COPY $x1
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, %vl, 3 /* e8 */, 0
+    %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 0, 3 /* e8 */, 0
+...
+---
+name: vred_vlreg_and_vl0
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vlreg_and_vl0
+    ; CHECK: %vl:gprnox0 = COPY $x1
+    ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 0, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, %vl, 3 /* e8 */, 0 /* tu, mu */
+    %vl:gprnox0 = COPY $x1
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 0, 3 /* e8 */, 0
+    %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, %vl, 3 /* e8 */, 0
+...
+---
+name: vred_other_user_is_vl2
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_other_user_is_vl2
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 2, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 2, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 1, 3 /* e8 */, 0
+    %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 2, 3 /* e8 */, 0
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir
index 3f966b0..0a366f4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir
@@ -110,4 +110,24 @@ body: |
     %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
     %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, -1, 3 /* e8 */, 0
 ...
+---
+name: vfcvt_x_f_v_nofpexcept
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vfcvt_x_f_v_nofpexcept
+    ; CHECK: %x:vr = nofpexcept PseudoVFCVT_X_F_V_M1 $noreg, $noreg, 0, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = nofpexcept PseudoVFCVT_X_F_V_M1 $noreg, $noreg, 0, -1, 3 /* e32 */, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
+---
+name: vfcvt_x_f_v_fpexcept
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vfcvt_x_f_v_fpexcept
+    ; CHECK: %x:vr = PseudoVFCVT_X_F_V_M1 $noreg, $noreg, 0, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVFCVT_X_F_V_M1 $noreg, $noreg, 0, -1, 3 /* e32 */, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/vlopt-volatile-ld.mir b/llvm/test/CodeGen/RISCV/rvv/vlopt-volatile-ld.mir
new file mode 100644
index 0000000..e8f7957
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vlopt-volatile-ld.mir
@@ -0,0 +1,13 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc %s -o - -mtriple=riscv64 -mattr=+v -run-pass=riscv-vl-optimizer -verify-machineinstrs | FileCheck %s
+
+---
+name: vleN_v_volatile
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vleN_v
+    ; CHECK: %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */ :: (volatile load (<vscale x 1 x s64>))
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0 :: (volatile load (<vscale x 1 x s64>))
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll
index 333117c..c334e70 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll
@@ -1654,9 +1654,9 @@ define <vscale x 1 x i64> @vmacc_vx_nxv1i64(<vscale x 1 x i64> %a, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, tu, mu
 ; RV32-NEXT:    vmacc.vv v9, v8, v10, v0.t
 ; RV32-NEXT:    vmv1r.v v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1685,9 +1685,9 @@ define <vscale x 1 x i64> @vmacc_vx_nxv1i64_unmasked(<vscale x 1 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
 ; RV32-NEXT:    vmacc.vv v9, v8, v10
 ; RV32-NEXT:    vmv1r.v v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1729,9 +1729,8 @@ define <vscale x 1 x i64> @vmacc_vx_nxv1i64_ta(<vscale x 1 x i64> %a, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmacc.vv v9, v8, v10, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1791,9 +1790,9 @@ define <vscale x 2 x i64> @vmacc_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m2, tu, mu
 ; RV32-NEXT:    vmacc.vv v10, v8, v12, v0.t
 ; RV32-NEXT:    vmv2r.v v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1822,9 +1821,9 @@ define <vscale x 2 x i64> @vmacc_vx_nxv2i64_unmasked(<vscale x 2 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m2, tu, ma
 ; RV32-NEXT:    vmacc.vv v10, v8, v12
 ; RV32-NEXT:    vmv2r.v v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1866,9 +1865,8 @@ define <vscale x 2 x i64> @vmacc_vx_nxv2i64_ta(<vscale x 2 x i64> %a, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmacc.vv v10, v8, v12, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1928,9 +1926,9 @@ define <vscale x 4 x i64> @vmacc_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m4, tu, mu
 ; RV32-NEXT:    vmacc.vv v12, v8, v16, v0.t
 ; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1959,9 +1957,9 @@ define <vscale x 4 x i64> @vmacc_vx_nxv4i64_unmasked(<vscale x 4 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m4, tu, ma
 ; RV32-NEXT:    vmacc.vv v12, v8, v16
 ; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2003,9 +2001,8 @@ define <vscale x 4 x i64> @vmacc_vx_nxv4i64_ta(<vscale x 4 x i64> %a, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmacc.vv v12, v8, v16, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2067,9 +2064,9 @@ define <vscale x 8 x i64> @vmacc_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m8, tu, mu
 ; RV32-NEXT:    vmacc.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv8r.v v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2098,9 +2095,9 @@ define <vscale x 8 x i64> @vmacc_vx_nxv8i64_unmasked(<vscale x 8 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m8, tu, ma
 ; RV32-NEXT:    vmacc.vv v16, v8, v24
 ; RV32-NEXT:    vmv8r.v v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2143,9 +2140,8 @@ define <vscale x 8 x i64> @vmacc_vx_nxv8i64_ta(<vscale x 8 x i64> %a, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmacc.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
index 7818e99..3df0763 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
@@ -1124,9 +1124,8 @@ define <vscale x 1 x i64> @vmax_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1151,9 +1150,8 @@ define <vscale x 1 x i64> @vmax_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1200,9 +1198,8 @@ define <vscale x 2 x i64> @vmax_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1227,9 +1224,8 @@ define <vscale x 2 x i64> @vmax_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1276,9 +1272,8 @@ define <vscale x 4 x i64> @vmax_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1303,9 +1298,8 @@ define <vscale x 4 x i64> @vmax_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1352,9 +1346,8 @@ define <vscale x 8 x i64> @vmax_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1379,9 +1372,8 @@ define <vscale x 8 x i64> @vmax_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
index 674b0b8..8147d46 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
@@ -1123,9 +1123,8 @@ define <vscale x 1 x i64> @vmaxu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1150,9 +1149,8 @@ define <vscale x 1 x i64> @vmaxu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1199,9 +1197,8 @@ define <vscale x 2 x i64> @vmaxu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1226,9 +1223,8 @@ define <vscale x 2 x i64> @vmaxu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1275,9 +1271,8 @@ define <vscale x 4 x i64> @vmaxu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1302,9 +1297,8 @@ define <vscale x 4 x i64> @vmaxu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1351,9 +1345,8 @@ define <vscale x 8 x i64> @vmaxu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1378,9 +1371,8 @@ define <vscale x 8 x i64> @vmaxu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
index 79631cd..614bd4cb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
@@ -1124,9 +1124,8 @@ define <vscale x 1 x i64> @vmin_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1151,9 +1150,8 @@ define <vscale x 1 x i64> @vmin_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1200,9 +1198,8 @@ define <vscale x 2 x i64> @vmin_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1227,9 +1224,8 @@ define <vscale x 2 x i64> @vmin_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1276,9 +1272,8 @@ define <vscale x 4 x i64> @vmin_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1303,9 +1298,8 @@ define <vscale x 4 x i64> @vmin_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1352,9 +1346,8 @@ define <vscale x 8 x i64> @vmin_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1379,9 +1372,8 @@ define <vscale x 8 x i64> @vmin_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
index bc93b62..21160553a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
@@ -1123,9 +1123,8 @@ define <vscale x 1 x i64> @vminu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1150,9 +1149,8 @@ define <vscale x 1 x i64> @vminu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1199,9 +1197,8 @@ define <vscale x 2 x i64> @vminu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1226,9 +1223,8 @@ define <vscale x 2 x i64> @vminu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1275,9 +1271,8 @@ define <vscale x 4 x i64> @vminu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1302,9 +1297,8 @@ define <vscale x 4 x i64> @vminu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1351,9 +1345,8 @@ define <vscale x 8 x i64> @vminu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1378,9 +1371,8 @@ define <vscale x 8 x i64> @vminu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmul-vp.ll
index b63098b..f0907e4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmul-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmul-vp.ll
@@ -934,9 +934,8 @@ define <vscale x 1 x i64> @vmul_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -961,9 +960,8 @@ define <vscale x 1 x i64> @vmul_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1010,9 +1008,8 @@ define <vscale x 2 x i64> @vmul_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1037,9 +1034,8 @@ define <vscale x 2 x i64> @vmul_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1086,9 +1082,8 @@ define <vscale x 4 x i64> @vmul_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1113,9 +1108,8 @@ define <vscale x 4 x i64> @vmul_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1162,9 +1156,8 @@ define <vscale x 8 x i64> @vmul_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1189,9 +1182,8 @@ define <vscale x 8 x i64> @vmul_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll
index 2e0daa6..3484d28 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll
@@ -1654,9 +1654,9 @@ define <vscale x 1 x i64> @vnmsac_vx_nxv1i64(<vscale x 1 x i64> %a, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, tu, mu
 ; RV32-NEXT:    vnmsac.vv v9, v8, v10, v0.t
 ; RV32-NEXT:    vmv1r.v v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1685,9 +1685,9 @@ define <vscale x 1 x i64> @vnmsac_vx_nxv1i64_unmasked(<vscale x 1 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
 ; RV32-NEXT:    vnmsac.vv v9, v8, v10
 ; RV32-NEXT:    vmv1r.v v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1729,9 +1729,8 @@ define <vscale x 1 x i64> @vnmsac_vx_nxv1i64_ta(<vscale x 1 x i64> %a, i64 %b, <
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vnmsac.vv v9, v8, v10, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1791,9 +1790,9 @@ define <vscale x 2 x i64> @vnmsac_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m2, tu, mu
 ; RV32-NEXT:    vnmsac.vv v10, v8, v12, v0.t
 ; RV32-NEXT:    vmv2r.v v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1822,9 +1821,9 @@ define <vscale x 2 x i64> @vnmsac_vx_nxv2i64_unmasked(<vscale x 2 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m2, tu, ma
 ; RV32-NEXT:    vnmsac.vv v10, v8, v12
 ; RV32-NEXT:    vmv2r.v v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1866,9 +1865,8 @@ define <vscale x 2 x i64> @vnmsac_vx_nxv2i64_ta(<vscale x 2 x i64> %a, i64 %b, <
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vnmsac.vv v10, v8, v12, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1928,9 +1926,9 @@ define <vscale x 4 x i64> @vnmsac_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m4, tu, mu
 ; RV32-NEXT:    vnmsac.vv v12, v8, v16, v0.t
 ; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1959,9 +1957,9 @@ define <vscale x 4 x i64> @vnmsac_vx_nxv4i64_unmasked(<vscale x 4 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m4, tu, ma
 ; RV32-NEXT:    vnmsac.vv v12, v8, v16
 ; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2003,9 +2001,8 @@ define <vscale x 4 x i64> @vnmsac_vx_nxv4i64_ta(<vscale x 4 x i64> %a, i64 %b, <
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vnmsac.vv v12, v8, v16, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2067,9 +2064,9 @@ define <vscale x 8 x i64> @vnmsac_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m8, tu, mu
 ; RV32-NEXT:    vnmsac.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv8r.v v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2098,9 +2095,9 @@ define <vscale x 8 x i64> @vnmsac_vx_nxv8i64_unmasked(<vscale x 8 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m8, tu, ma
 ; RV32-NEXT:    vnmsac.vv v16, v8, v24
 ; RV32-NEXT:    vmv8r.v v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2143,9 +2140,8 @@ define <vscale x 8 x i64> @vnmsac_vx_nxv8i64_ta(<vscale x 8 x i64> %a, i64 %b, <
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vnmsac.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vor-vp.ll
index ef281c5..e864d71 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vor-vp.ll
@@ -1326,9 +1326,8 @@ define <vscale x 1 x i64> @vor_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vscal
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1353,9 +1352,8 @@ define <vscale x 1 x i64> @vor_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1422,9 +1420,8 @@ define <vscale x 2 x i64> @vor_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vscal
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1449,9 +1446,8 @@ define <vscale x 2 x i64> @vor_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1518,9 +1514,8 @@ define <vscale x 4 x i64> @vor_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vscal
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1545,9 +1540,8 @@ define <vscale x 4 x i64> @vor_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1614,9 +1608,8 @@ define <vscale x 8 x i64> @vor_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vscal
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1641,9 +1634,8 @@ define <vscale x 8 x i64> @vor_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-mask.ll
index d99fd03..ce9d6c5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-mask.ll
@@ -785,8 +785,7 @@ define zeroext i1 @vreduce_and_nxv128i1(<vscale x 128 x i1> %v) {
 ; CHECK-LABEL: vreduce_and_nxv128i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmand.mm v8, v0, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v0, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -814,8 +813,7 @@ define zeroext i1 @vreduce_smax_nxv128i1(<vscale x 128 x i1> %v) {
 ; CHECK-LABEL: vreduce_smax_nxv128i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmand.mm v8, v0, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v0, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -829,8 +827,7 @@ define zeroext i1 @vreduce_umin_nxv128i1(<vscale x 128 x i1> %v) {
 ; CHECK-LABEL: vreduce_umin_nxv128i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmand.mm v8, v0, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v0, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -892,8 +889,7 @@ define zeroext i1 @vreduce_and_nxv256i1(<vscale x 256 x i1> %v) {
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmand.mm v8, v8, v10
 ; CHECK-NEXT:    vmand.mm v9, v0, v9
-; CHECK-NEXT:    vmand.mm v8, v9, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v9, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -925,8 +921,7 @@ define zeroext i1 @vreduce_smax_nxv256i1(<vscale x 256 x i1> %v) {
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmand.mm v8, v8, v10
 ; CHECK-NEXT:    vmand.mm v9, v0, v9
-; CHECK-NEXT:    vmand.mm v8, v9, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v9, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -942,8 +937,7 @@ define zeroext i1 @vreduce_umin_nxv256i1(<vscale x 256 x i1> %v) {
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmand.mm v8, v8, v10
 ; CHECK-NEXT:    vmand.mm v9, v0, v9
-; CHECK-NEXT:    vmand.mm v8, v9, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v9, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -1019,8 +1013,7 @@ define zeroext i1 @vreduce_and_nxv512i1(<vscale x 512 x i1> %v) {
 ; CHECK-NEXT:    vmand.mm v11, v0, v11
 ; CHECK-NEXT:    vmand.mm v8, v8, v10
 ; CHECK-NEXT:    vmand.mm v9, v11, v9
-; CHECK-NEXT:    vmand.mm v8, v9, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v9, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -1060,8 +1053,7 @@ define zeroext i1 @vreduce_smax_nxv512i1(<vscale x 512 x i1> %v) {
 ; CHECK-NEXT:    vmand.mm v11, v0, v11
 ; CHECK-NEXT:    vmand.mm v8, v8, v10
 ; CHECK-NEXT:    vmand.mm v9, v11, v9
-; CHECK-NEXT:    vmand.mm v8, v9, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v9, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -1081,8 +1073,7 @@ define zeroext i1 @vreduce_umin_nxv512i1(<vscale x 512 x i1> %v) {
 ; CHECK-NEXT:    vmand.mm v11, v0, v11
 ; CHECK-NEXT:    vmand.mm v8, v8, v10
 ; CHECK-NEXT:    vmand.mm v9, v11, v9
-; CHECK-NEXT:    vmand.mm v8, v9, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v9, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -1186,8 +1177,7 @@ define zeroext i1 @vreduce_and_nxv1024i1(<vscale x 1024 x i1> %v) {
 ; CHECK-NEXT:    vmand.mm v11, v15, v11
 ; CHECK-NEXT:    vmand.mm v8, v8, v10
 ; CHECK-NEXT:    vmand.mm v9, v11, v9
-; CHECK-NEXT:    vmand.mm v8, v9, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v9, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -1243,8 +1233,7 @@ define zeroext i1 @vreduce_smax_nxv1024i1(<vscale x 1024 x i1> %v) {
 ; CHECK-NEXT:    vmand.mm v11, v15, v11
 ; CHECK-NEXT:    vmand.mm v8, v8, v10
 ; CHECK-NEXT:    vmand.mm v9, v11, v9
-; CHECK-NEXT:    vmand.mm v8, v9, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v9, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -1272,8 +1261,7 @@ define zeroext i1 @vreduce_umin_nxv1024i1(<vscale x 1024 x i1> %v) {
 ; CHECK-NEXT:    vmand.mm v11, v15, v11
 ; CHECK-NEXT:    vmand.mm v8, v8, v10
 ; CHECK-NEXT:    vmand.mm v9, v11, v9
-; CHECK-NEXT:    vmand.mm v8, v9, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v9, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll
index 3273274..66ba269 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll
@@ -893,9 +893,8 @@ define <vscale x 1 x i64> @vrem_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -920,9 +919,8 @@ define <vscale x 1 x i64> @vrem_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -969,9 +967,8 @@ define <vscale x 2 x i64> @vrem_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -996,9 +993,8 @@ define <vscale x 2 x i64> @vrem_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1045,9 +1041,8 @@ define <vscale x 4 x i64> @vrem_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1072,9 +1067,8 @@ define <vscale x 4 x i64> @vrem_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1121,9 +1115,8 @@ define <vscale x 8 x i64> @vrem_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1148,9 +1141,8 @@ define <vscale x 8 x i64> @vrem_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll
index 6b588d0..4608661 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll
@@ -892,9 +892,8 @@ define <vscale x 1 x i64> @vremu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -919,9 +918,8 @@ define <vscale x 1 x i64> @vremu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -968,9 +966,8 @@ define <vscale x 2 x i64> @vremu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -995,9 +992,8 @@ define <vscale x 2 x i64> @vremu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1044,9 +1040,8 @@ define <vscale x 4 x i64> @vremu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1071,9 +1066,8 @@ define <vscale x 4 x i64> @vremu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1120,9 +1114,8 @@ define <vscale x 8 x i64> @vremu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1147,9 +1140,8 @@ define <vscale x 8 x i64> @vremu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vrsub-vp.ll
index 0f38e94..c41139c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrsub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrsub-vp.ll
@@ -842,9 +842,8 @@ define <vscale x 1 x i64> @vrsub_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -869,9 +868,8 @@ define <vscale x 1 x i64> @vrsub_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v9, v8
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -918,9 +916,8 @@ define <vscale x 2 x i64> @vrsub_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -945,9 +942,8 @@ define <vscale x 2 x i64> @vrsub_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v10, v8
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -994,9 +990,8 @@ define <vscale x 4 x i64> @vrsub_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1021,9 +1016,8 @@ define <vscale x 4 x i64> @vrsub_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v12, v8
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1070,9 +1064,8 @@ define <vscale x 8 x i64> @vrsub_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1097,9 +1090,8 @@ define <vscale x 8 x i64> @vrsub_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v16, v8
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
index 575d041..e471f4b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
@@ -1425,9 +1425,8 @@ define <vscale x 1 x i64> @vsadd_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1452,9 +1451,8 @@ define <vscale x 1 x i64> @vsadd_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1521,9 +1519,8 @@ define <vscale x 2 x i64> @vsadd_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1548,9 +1545,8 @@ define <vscale x 2 x i64> @vsadd_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1617,9 +1613,8 @@ define <vscale x 4 x i64> @vsadd_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1644,9 +1639,8 @@ define <vscale x 4 x i64> @vsadd_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1713,9 +1707,8 @@ define <vscale x 8 x i64> @vsadd_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1740,9 +1733,8 @@ define <vscale x 8 x i64> @vsadd_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
index c9ed72b..f76a2b4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
@@ -1424,9 +1424,8 @@ define <vscale x 1 x i64> @vsaddu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1451,9 +1450,8 @@ define <vscale x 1 x i64> @vsaddu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1520,9 +1518,8 @@ define <vscale x 2 x i64> @vsaddu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1547,9 +1544,8 @@ define <vscale x 2 x i64> @vsaddu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1616,9 +1612,8 @@ define <vscale x 4 x i64> @vsaddu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1643,9 +1638,8 @@ define <vscale x 4 x i64> @vsaddu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1712,9 +1706,8 @@ define <vscale x 8 x i64> @vsaddu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1739,9 +1732,8 @@ define <vscale x 8 x i64> @vsaddu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
index 001f744..c041a16 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
@@ -131,10 +131,9 @@ declare <vscale x 2 x half> @llvm.vp.sitofp.nxv2f16.nxv2i7(<vscale x 2 x i7>, <v
 define <vscale x 2 x half> @vsitofp_nxv2f16_nxv2i7(<vscale x 2 x i7> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vsitofp_nxv2f16_nxv2i7:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; ZVFH-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; ZVFH-NEXT:    vadd.vv v8, v8, v8
 ; ZVFH-NEXT:    vsra.vi v9, v8, 1
-; ZVFH-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; ZVFH-NEXT:    vfwcvt.f.x.v v8, v9, v0.t
 ; ZVFH-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
index c0da928..ebf8d5e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
@@ -1468,9 +1468,8 @@ define <vscale x 1 x i64> @vssub_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1495,9 +1494,8 @@ define <vscale x 1 x i64> @vssub_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1566,9 +1564,8 @@ define <vscale x 2 x i64> @vssub_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1593,9 +1590,8 @@ define <vscale x 2 x i64> @vssub_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1664,9 +1660,8 @@ define <vscale x 4 x i64> @vssub_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1691,9 +1686,8 @@ define <vscale x 4 x i64> @vssub_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1762,9 +1756,8 @@ define <vscale x 8 x i64> @vssub_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1789,9 +1782,8 @@ define <vscale x 8 x i64> @vssub_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
index b602f11..d54901c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
@@ -1466,9 +1466,8 @@ define <vscale x 1 x i64> @vssubu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1493,9 +1492,8 @@ define <vscale x 1 x i64> @vssubu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1564,9 +1562,8 @@ define <vscale x 2 x i64> @vssubu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1591,9 +1588,8 @@ define <vscale x 2 x i64> @vssubu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1662,9 +1658,8 @@ define <vscale x 4 x i64> @vssubu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1689,9 +1684,8 @@ define <vscale x 4 x i64> @vssubu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1760,9 +1754,8 @@ define <vscale x 8 x i64> @vssubu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1787,9 +1780,8 @@ define <vscale x 8 x i64> @vssubu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsub-vp.ll
index 65ba791..e28da6b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsub-vp.ll
@@ -922,9 +922,8 @@ define <vscale x 1 x i64> @vsub_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -949,9 +948,8 @@ define <vscale x 1 x i64> @vsub_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -998,9 +996,8 @@ define <vscale x 2 x i64> @vsub_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1025,9 +1022,8 @@ define <vscale x 2 x i64> @vsub_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1074,9 +1070,8 @@ define <vscale x 4 x i64> @vsub_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1101,9 +1096,8 @@ define <vscale x 4 x i64> @vsub_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1150,9 +1144,8 @@ define <vscale x 8 x i64> @vsub_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1177,9 +1170,8 @@ define <vscale x 8 x i64> @vsub_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
index 06d8519..3d27a1e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
@@ -124,9 +124,8 @@ define <vscale x 2 x half> @vuitofp_nxv2f16_nxv2i7(<vscale x 2 x i7> %va, <vscal
 ; ZVFH-LABEL: vuitofp_nxv2f16_nxv2i7:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    li a1, 127
-; ZVFH-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
-; ZVFH-NEXT:    vand.vx v9, v8, a1
 ; ZVFH-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; ZVFH-NEXT:    vand.vx v9, v8, a1
 ; ZVFH-NEXT:    vfwcvt.f.xu.v v8, v9, v0.t
 ; ZVFH-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vxor-vp.ll
index f3dd7ec..1694a7a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vxor-vp.ll
@@ -1694,9 +1694,8 @@ define <vscale x 1 x i64> @vxor_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1721,9 +1720,8 @@ define <vscale x 1 x i64> @vxor_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1810,9 +1808,8 @@ define <vscale x 2 x i64> @vxor_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1837,9 +1834,8 @@ define <vscale x 2 x i64> @vxor_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1926,9 +1922,8 @@ define <vscale x 4 x i64> @vxor_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1953,9 +1948,8 @@ define <vscale x 4 x i64> @vxor_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2042,9 +2036,8 @@ define <vscale x 8 x i64> @vxor_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2069,9 +2062,8 @@ define <vscale x 8 x i64> @vxor_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/xcvmem-heuristic.ll b/llvm/test/CodeGen/RISCV/xcvmem-heuristic.ll
new file mode 100644
index 0000000..c8832bf
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/xcvmem-heuristic.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O3 -mtriple=riscv32 -mattr=+m,+xcvmem -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK
+
+define i32 @test_heuristic(ptr %b, i32 %e, i1 %0) {
+; CHECK-LABEL: test_heuristic:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    add a3, a0, a1
+; CHECK-NEXT:    andi a2, a2, 1
+; CHECK-NEXT:  .LBB0_1: # %loop
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cv.lbu a1, (a3), 1
+; CHECK-NEXT:    addi a0, a0, 1
+; CHECK-NEXT:    beqz a2, .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:    ret
+entry:
+  %1 = getelementptr i8, ptr %b, i32 %e
+  br label %loop
+
+loop:                                             ; preds = %loop, %entry
+  %2 = phi ptr [ %b, %entry ], [ %7, %loop ]
+  %3 = phi ptr [ %1, %entry ], [ %8, %loop ]
+  %4 = load i8, ptr %2, align 1
+  %5 = load i8, ptr %3, align 1
+  %6 = zext i8 %5 to i32
+  %7 = getelementptr i8, ptr %2, i32 1
+  %8 = getelementptr i8, ptr %3, i32 1
+  br i1 %0, label %exit, label %loop
+
+exit:                                             ; preds = %loop
+  ret i32 %6
+}
diff --git a/llvm/test/CodeGen/SPIRV/AtomicCompareExchange.ll b/llvm/test/CodeGen/SPIRV/AtomicCompareExchange.ll
index f8207c56..5ce4a19 100644
--- a/llvm/test/CodeGen/SPIRV/AtomicCompareExchange.ll
+++ b/llvm/test/CodeGen/SPIRV/AtomicCompareExchange.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
 
-; CHECK-SPIRV:      %[[#Int:]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG:      %[[#Int:]] = OpTypeInt 32 0
 ; CHECK-SPIRV-DAG:  %[[#MemScope_CrossDevice:]] = OpConstant %[[#Int]] 0
 ; CHECK-SPIRV-DAG:  %[[#MemSemEqual_SeqCst:]] = OpConstant %[[#Int]] 16
 ; CHECK-SPIRV-DAG:  %[[#MemSemUnequal_Acquire:]] = OpConstant %[[#Int]] 2
diff --git a/llvm/test/CodeGen/SPIRV/event-zero-const.ll b/llvm/test/CodeGen/SPIRV/event-zero-const.ll
index f3f20a0..523d2ad 100644
--- a/llvm/test/CodeGen/SPIRV/event-zero-const.ll
+++ b/llvm/test/CodeGen/SPIRV/event-zero-const.ll
@@ -4,10 +4,10 @@
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK: %[[#LongTy:]] = OpTypeInt 64 0
-; CHECK: %[[#EventTy:]] = OpTypeEvent
-; CHECK: %[[#LongNull:]] = OpConstantNull %[[#LongTy]]
-; CHECK: %[[#EventNull:]] = OpConstantNull %[[#EventTy]]
+; CHECK-DAG: %[[#LongTy:]] = OpTypeInt 64 0
+; CHECK-DAG: %[[#EventTy:]] = OpTypeEvent
+; CHECK-DAG: %[[#LongNull:]] = OpConstantNull %[[#LongTy]]
+; CHECK-DAG: %[[#EventNull:]] = OpConstantNull %[[#EventTy]]
 ; CHECK: OpFunction
 ; CHECK: OpINotEqual %[[#]] %[[#]] %[[#LongNull]]
 ; CHECK: OpGroupAsyncCopy %[[#EventTy]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#EventNull]]
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp-simple-hierarchy.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp-simple-hierarchy.ll
index 368c5d4..80309e9 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp-simple-hierarchy.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp-simple-hierarchy.ll
@@ -1,16 +1,17 @@
-; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s
 ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; TODO: This test currently fails with LLVM_ENABLE_EXPENSIVE_CHECKS enabled
-; XFAIL: expensive_checks
-
 ; CHECK-DAG: OpName %[[I9:.*]] "_ZN13BaseIncrement9incrementEPi"
 ; CHECK-DAG: OpName %[[I29:.*]] "_ZN12IncrementBy29incrementEPi"
 ; CHECK-DAG: OpName %[[I49:.*]] "_ZN12IncrementBy49incrementEPi"
 ; CHECK-DAG: OpName %[[I89:.*]] "_ZN12IncrementBy89incrementEPi"
+; CHECK-DAG: OpName %[[Foo:.*]] "foo"
 
 ; CHECK-DAG: %[[TyVoid:.*]] = OpTypeVoid
-; CHECK-DAG: %[[TyArr:.*]] = OpTypeArray
+; CHECK-DAG: %[[TyInt32:.*]] = OpTypeInt 32 0
+; CHECK-DAG: %[[TyInt8:.*]] = OpTypeInt 8 0
+; CHECK-DAG: %[[Const8:.*]] = OpConstant %[[TyInt32]] 8
+; CHECK-DAG: %[[TyArr:.*]] = OpTypeArray %[[TyInt8]] %[[Const8]]
 ; CHECK-DAG: %[[TyStruct1:.*]] = OpTypeStruct %[[TyArr]]
 ; CHECK-DAG: %[[TyStruct2:.*]] = OpTypeStruct %[[TyStruct1]]
 ; CHECK-DAG: %[[TyPtrStruct2:.*]] = OpTypePointer Generic %[[TyStruct2]]
@@ -18,16 +19,21 @@
 ; CHECK-DAG: %[[TyPtrFun:.*]] = OpTypePointer Generic %[[TyFun]]
 ; CHECK-DAG: %[[TyPtrPtrFun:.*]] = OpTypePointer Generic %[[TyPtrFun]]
 
-; CHECK: %[[I9]] = OpFunction
-; CHECK: %[[I29]] = OpFunction
-; CHECK: %[[I49]] = OpFunction
-; CHECK: %[[I89]] = OpFunction
+; CHECK-DAG: %[[I9]] = OpFunction
+; CHECK-DAG: %[[I29]] = OpFunction
+; CHECK-DAG: %[[I49]] = OpFunction
+; CHECK-DAG: %[[I89]] = OpFunction
+
+; CHECK: %[[Foo]] = OpFunction
+; CHECK-4: OpFunctionParameter
 
 ; CHECK: %[[Arg1:.*]] = OpPhi %[[TyPtrStruct2]]
 ; CHECK: %[[VTbl:.*]] = OpBitcast %[[TyPtrPtrFun]] %[[#]]
 ; CHECK: %[[FP:.*]] = OpLoad %[[TyPtrFun]] %[[VTbl]]
 ; CHECK: %[[#]] = OpFunctionPointerCallINTEL %[[TyVoid]] %[[FP]] %[[Arg1]] %[[#]]
 
+; CHECK-NO: OpFunction
+
 %"cls::id" = type { %"cls::detail::array" }
 %"cls::detail::array" = type { [1 x i64] }
 %struct.obj_storage_t = type { %"struct.aligned_storage<BaseIncrement, IncrementBy2, IncrementBy4, IncrementBy8>::type" }
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_const.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_const.ll
index 75ad382..b96da63 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_const.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_const.ll
@@ -1,9 +1,6 @@
-; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s
 ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; TODO: This test currently fails with LLVM_ENABLE_EXPENSIVE_CHECKS enabled
-; XFAIL: expensive_checks
-
 ; CHECK-DAG: OpCapability FunctionPointersINTEL
 ; CHECK-DAG: OpCapability Int64
 ; CHECK: OpExtension "SPV_INTEL_function_pointers"
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll
index d38de21..8edecc1 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll
@@ -2,15 +2,9 @@
 ; pointers/PtrCast-null-in-OpSpecConstantOp.ll (that is OpSpecConstantOp with ptr-cast operation) correctly
 ; work also for function pointers.
 
-; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - --spirv-ext=+SPV_INTEL_function_pointers | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - --spirv-ext=+SPV_INTEL_function_pointers | FileCheck %s
 ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; TODO: This test currently fails with LLVM_ENABLE_EXPENSIVE_CHECKS enabled
-; XFAIL: expensive_checks
-
-; Running with -verify-machineinstrs would lead to "Reading virtual register without a def"
-; error, because OpConstantFunctionPointerINTEL forward-refers to a function definition.
-
 ; CHECK-COUNT-3: %[[#]] = OpSpecConstantOp %[[#]] 121 %[[#]]
 ; CHECK-COUNT-3: OpPtrCastToGeneric
 
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll
index e006651..91286d5 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll
@@ -31,20 +31,20 @@
 ; CHECK-DAG: %[[#Const123:]] = OpConstant %[[#Int32Ty]] 123
 ; CHECK-DAG: %[[#Const42:]] = OpConstant %[[#DoubleTy:]] 42
 
-; CHECK: %[[#Dialect:]] = OpAsmTargetINTEL "spirv64-unknown-unknown"
+; CHECK-DAG: %[[#Dialect:]] = OpAsmTargetINTEL "spirv64-unknown-unknown"
 ; CHECK-NO: OpAsmTargetINTEL
 
-; CHECK: %[[#Asm1:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "" ""
-; CHECK: %[[#Asm2:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "nop" ""
-; CHECK: %[[#Asm3:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "" "~{cc},~{memory}"
-; CHECK: %[[#Asm4:]] = OpAsmINTEL %[[#Int32Ty]] %[[#Fun2Ty:]] %[[#Dialect]] "clobber_out $0" "=&r"
-; CHECK: %[[#Asm5:]] = OpAsmINTEL %[[#Int32Ty]] %[[#Fun3Ty]] %[[#Dialect]] "icmd $0 $1" "=r,r"
-; CHECK: %[[#Asm6:]] = OpAsmINTEL %[[#FloatTy]] %[[#Fun4Ty]] %[[#Dialect]] "fcmd $0 $1" "=r,r"
-; CHECK: %[[#Asm7:]] = OpAsmINTEL %[[#HalfTy]] %[[#Fun5Ty]] %[[#Dialect]] "fcmdext $0 $1 $2" "=r,r,r"
-; CHECK: %[[#Asm8:]] = OpAsmINTEL %[[#Int8Ty]] %[[#Fun6Ty]] %[[#Dialect]] "cmdext $0 $3 $1 $2" "=r,r,r,r"
-; CHECK: %[[#Asm9:]] = OpAsmINTEL %[[#Int64Ty]] %[[#Fun7Ty]] %[[#Dialect]] "icmdext $0 $3 $1 $2" "=r,r,r,r"
-; CHECK: %[[#Asm10:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun8Ty]] %[[#Dialect]] "constcmd $0 $1" "r,r"
-; CHECK: %[[#Asm11:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun8Ty]] %[[#Dialect]] "constcmd $0 $1" "i,i"
+; CHECK-DAG: %[[#Asm1:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "" ""
+; CHECK-DAG: %[[#Asm2:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "nop" ""
+; CHECK-DAG: %[[#Asm3:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "" "~{cc},~{memory}"
+; CHECK-DAG: %[[#Asm4:]] = OpAsmINTEL %[[#Int32Ty]] %[[#Fun2Ty:]] %[[#Dialect]] "clobber_out $0" "=&r"
+; CHECK-DAG: %[[#Asm5:]] = OpAsmINTEL %[[#Int32Ty]] %[[#Fun3Ty]] %[[#Dialect]] "icmd $0 $1" "=r,r"
+; CHECK-DAG: %[[#Asm6:]] = OpAsmINTEL %[[#FloatTy]] %[[#Fun4Ty]] %[[#Dialect]] "fcmd $0 $1" "=r,r"
+; CHECK-DAG: %[[#Asm7:]] = OpAsmINTEL %[[#HalfTy]] %[[#Fun5Ty]] %[[#Dialect]] "fcmdext $0 $1 $2" "=r,r,r"
+; CHECK-DAG: %[[#Asm8:]] = OpAsmINTEL %[[#Int8Ty]] %[[#Fun6Ty]] %[[#Dialect]] "cmdext $0 $3 $1 $2" "=r,r,r,r"
+; CHECK-DAG: %[[#Asm9:]] = OpAsmINTEL %[[#Int64Ty]] %[[#Fun7Ty]] %[[#Dialect]] "icmdext $0 $3 $1 $2" "=r,r,r,r"
+; CHECK-DAG: %[[#Asm10:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun8Ty]] %[[#Dialect]] "constcmd $0 $1" "r,r"
+; CHECK-DAG: %[[#Asm11:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun8Ty]] %[[#Dialect]] "constcmd $0 $1" "i,i"
 ; CHECK-NO: OpAsmINTEL
 
 ; CHECK: OpFunction
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
index 8ecd0a2..bd07ba1 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
@@ -6,9 +6,11 @@
 
 ; CHECK: OpCapability ShaderClockKHR
 ; CHECK: OpExtension "SPV_KHR_shader_clock"
-; CHECK-DAG: [[uint:%[a-z0-9_]+]] = OpTypeInt 32
+; CHECK-DAG: [[uint:%[a-z0-9_]+]] = OpTypeInt 32 0
 ; CHECK-DAG: [[ulong:%[a-z0-9_]+]] = OpTypeInt 64
 ; CHECK-DAG: [[v2uint:%[a-z0-9_]+]] = OpTypeVector [[uint]] 2
+; CHECK-DAG: OpConstant [[uint]] 8
+; CHECK-DAG: OpConstant [[uint]] 16
 ; CHECK-DAG: [[uint_1:%[a-z0-9_]+]] = OpConstant [[uint]] 1
 ; CHECK-DAG: [[uint_2:%[a-z0-9_]+]] = OpConstant [[uint]] 2
 ; CHECK-DAG: [[uint_3:%[a-z0-9_]+]] = OpConstant [[uint]] 3
diff --git a/llvm/test/CodeGen/SPIRV/global-var-name-linkage.ll b/llvm/test/CodeGen/SPIRV/global-var-name-linkage.ll
new file mode 100644
index 0000000..4501819
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/global-var-name-linkage.ll
@@ -0,0 +1,59 @@
+; Check names and decoration of global variables.
+
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: OpName %[[#id18:]] "G1"
+; CHECK-DAG: OpName %[[#id22:]] "g1"
+; CHECK-DAG: OpName %[[#id23:]] "g2"
+; CHECK-DAG: OpName %[[#id27:]] "g4"
+; CHECK-DAG: OpName %[[#id30:]] "c1"
+; CHECK-DAG: OpName %[[#id31:]] "n_t"
+; CHECK-DAG: OpName %[[#id32:]] "w"
+; CHECK-DAG: OpName %[[#id34:]] "a.b"
+; CHECK-DAG: OpName %[[#id35:]] "e"
+; CHECK-DAG: OpName %[[#id36:]] "y.z"
+; CHECK-DAG: OpName %[[#id38:]] "x"
+
+; CHECK-NOT: OpDecorate %[[#id18]] LinkageAttributes
+; CHECK-DAG: OpDecorate %[[#id18]] Constant
+; CHECK-DAG: OpDecorate %[[#id22]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id22]] LinkageAttributes "g1" Export
+; CHECK-DAG: OpDecorate %[[#id23]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id27]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id27]] LinkageAttributes "g4" Export
+; CHECK-DAG: OpDecorate %[[#id30]] Constant
+; CHECK-DAG: OpDecorate %[[#id30]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id30]] LinkageAttributes "c1" Export
+; CHECK-DAG: OpDecorate %[[#id31]] Constant
+; CHECK-DAG: OpDecorate %[[#id31]] LinkageAttributes "n_t" Import
+; CHECK-DAG: OpDecorate %[[#id32]] Constant
+; CHECK-DAG: OpDecorate %[[#id32]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id32]] LinkageAttributes "w" Export
+; CHECK-DAG: OpDecorate %[[#id34]] Constant
+; CHECK-DAG: OpDecorate %[[#id34]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id35]] LinkageAttributes "e" Import
+; CHECK-DAG: OpDecorate %[[#id36]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id38]] Constant
+; CHECK-DAG: OpDecorate %[[#id38]] Alignment 4
+
+%"class.sycl::_V1::nd_item" = type { i8 }
+
+@G1 = private unnamed_addr addrspace(1) constant %"class.sycl::_V1::nd_item" poison, align 1
+@g1 = addrspace(1) global i32 1, align 4
+@g2 = internal addrspace(1) global i32 2, align 4
+@g4 = common addrspace(1) global i32 0, align 4
+@c1 = addrspace(2) constant [2 x i32] [i32 0, i32 1], align 4
+@n_t = external addrspace(2) constant [256 x i32]
+@w = addrspace(1) constant i32 0, align 4
+@a.b = internal addrspace(2) constant [2 x i32] [i32 2, i32 3], align 4
+@e = external addrspace(1) global i32
+@y.z = internal addrspace(1) global i32 0, align 4
+@x = internal addrspace(2) constant float 1.000000e+00, align 4
+
+define internal spir_func void @foo(ptr addrspace(4) align 1 %arg) {
+  ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_GroupID.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_GroupID.ll
new file mode 100644
index 0000000..92947f7
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_GroupID.ll
@@ -0,0 +1,52 @@
+; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG:        %[[#int:]] = OpTypeInt 32 0
+; CHECK-DAG:        %[[#v3int:]] = OpTypeVector %[[#int]] 3
+; CHECK-DAG:        %[[#ptr_Input_v3int:]] = OpTypePointer Input %[[#v3int]]
+; CHECK-DAG:        %[[#tempvar:]] = OpUndef %[[#v3int]]
+; CHECK-DAG:        %[[#WorkgroupId:]] = OpVariable %[[#ptr_Input_v3int]] Input
+
+; CHECK-DAG:        OpEntryPoint GLCompute {{.*}} %[[#WorkgroupId]]
+; CHECK-DAG:        OpName %[[#WorkgroupId]] "__spirv_BuiltInWorkgroupId"
+; CHECK-DAG:        OpDecorate %[[#WorkgroupId]] LinkageAttributes "__spirv_BuiltInWorkgroupId" Import
+; CHECK-DAG:        OpDecorate %[[#WorkgroupId]] BuiltIn WorkgroupId
+
+target triple = "spirv-unknown-vulkan-library"
+
+declare void @group_id_user(<3 x i32>)
+
+; Function Attrs: convergent noinline norecurse
+define void @main() #1 {
+entry:
+
+; CHECK:        %[[#load:]] = OpLoad %[[#v3int]] %[[#WorkgroupId]]
+; CHECK:        %[[#load0:]] = OpCompositeExtract %[[#int]] %[[#load]] 0
+  %1 = call i32 @llvm.spv.group.id(i32 0)
+
+; CHECK:        %[[#tempvar:]] = OpCompositeInsert %[[#v3int]] %[[#load0]] %[[#tempvar]]
+  %2 = insertelement <3 x i32> poison, i32 %1, i64 0
+
+; CHECK:        %[[#load:]] = OpLoad %[[#v3int]] %[[#WorkgroupId]]
+; CHECK:        %[[#load1:]] = OpCompositeExtract %[[#int]] %[[#load]] 1
+  %3 = call i32 @llvm.spv.group.id(i32 1)
+
+; CHECK:        %[[#tempvar:]] = OpCompositeInsert %[[#v3int]] %[[#load1]] %[[#tempvar]] 1
+  %4 = insertelement <3 x i32> %2, i32 %3, i64 1
+
+; CHECK:        %[[#load:]] = OpLoad %[[#v3int]] %[[#WorkgroupId]]
+; CHECK:        %[[#load2:]] = OpCompositeExtract %[[#int]] %[[#load]] 2
+  %5 = call i32 @llvm.spv.group.id(i32 2)
+
+; CHECK:        %[[#tempvar:]] = OpCompositeInsert %[[#v3int]] %[[#load2]] %[[#tempvar]] 2
+  %6 = insertelement <3 x i32> %4, i32 %5, i64 2
+
+  call spir_func void @group_id_user(<3 x i32> %6)
+  ret void
+}
+
+; Function Attrs: nounwind willreturn memory(none)
+declare i32 @llvm.spv.group.id(i32) #3
+
+attributes #1 = { convergent noinline norecurse "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #3 = { nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll
index 2e0eb8c..b1625c0 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll
@@ -15,7 +15,7 @@ entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_16]]
   ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_16]]
   ; CHECK: %[[#]] = OpExtInst %[[#vec3_float_16]] %[[#op_ext_glsl]] Cross %[[#arg0]] %[[#arg1]]
-  %hlsl.cross = call <3 x half> @llvm.spv.cross.v4f16(<3 x half> %a, <3 x half> %b)
+  %hlsl.cross = call <3 x half> @llvm.spv.cross.v3f16(<3 x half> %a, <3 x half> %b)
   ret <3 x half> %hlsl.cross
 }
 
@@ -25,9 +25,9 @@ entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_32]]
   ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_32]]
   ; CHECK: %[[#]] = OpExtInst %[[#vec3_float_32]] %[[#op_ext_glsl]] Cross %[[#arg0]] %[[#arg1]]
-  %hlsl.cross = call <3 x float> @llvm.spv.cross.v4f32(<3 x float> %a, <3 x float> %b)
+  %hlsl.cross = call <3 x float> @llvm.spv.cross.v3f32(<3 x float> %a, <3 x float> %b)
   ret <3 x float> %hlsl.cross
 }
 
-declare <3 x half> @llvm.spv.cross.v4f16(<3 x half>, <3 x half>)
-declare <3 x float> @llvm.spv.cross.v4f32(<3 x float>, <3 x float>)
+declare <3 x half> @llvm.spv.cross.v3f16(<3 x half>, <3 x half>)
+declare <3 x float> @llvm.spv.cross.v3f32(<3 x float>, <3 x float>)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/distance.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/distance.ll
new file mode 100644
index 0000000..85a24a01
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/distance.ll
@@ -0,0 +1,33 @@
+; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; Make sure SPIRV operation function calls for distance are lowered correctly.
+
+; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450"
+; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16
+; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4
+; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32
+; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4
+
+define noundef half @distance_half4(<4 x half> noundef %a, <4 x half> noundef %b) {
+entry:
+  ; CHECK: %[[#]] = OpFunction %[[#float_16]] None %[[#]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]]
+  ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_16]]
+  ; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Distance %[[#arg0]] %[[#arg1]]
+  %spv.distance = call half @llvm.spv.distance.f16(<4 x half> %a, <4 x half> %b)
+  ret half %spv.distance
+}
+
+define noundef float @distance_float4(<4 x float> noundef %a, <4 x float> noundef %b) {
+entry:
+  ; CHECK: %[[#]] = OpFunction %[[#float_32]] None %[[#]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]]
+  ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_32]]
+  ; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Distance %[[#arg0]] %[[#arg1]]
+  %spv.distance = call float @llvm.spv.distance.f32(<4 x float> %a, <4 x float> %b)
+  ret float %spv.distance
+}
+
+declare half @llvm.spv.distance.f16(<4 x half>, <4 x half>)
+declare float @llvm.spv.distance.f32(<4 x float>, <4 x float>)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll
index b4a9d8e0..1ac862b 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll
@@ -11,19 +11,21 @@
 
 define noundef half @length_half4(<4 x half> noundef %a) {
 entry:
-  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
+  ; CHECK: %[[#]] = OpFunction %[[#float_16]] None %[[#]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]]
   ; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Length %[[#arg0]]
-  %hlsl.length = call half @llvm.spv.length.v4f16(<4 x half> %a)
+  %hlsl.length = call half @llvm.spv.length.f16(<4 x half> %a)
   ret half %hlsl.length
 }
 
 define noundef float @length_float4(<4 x float> noundef %a) {
 entry:
-  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
+  ; CHECK: %[[#]] = OpFunction %[[#float_32]] None %[[#]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]]
   ; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Length %[[#arg0]]
-  %hlsl.length = call float @llvm.spv.length.v4f32(<4 x float> %a)
+  %hlsl.length = call float @llvm.spv.length.f32(<4 x float> %a)
   ret float %hlsl.length
 }
 
-declare half @llvm.spv.length.v4f16(<4 x half>)
-declare float @llvm.spv.length.v4f32(<4 x float>)
+declare half @llvm.spv.length.f16(<4 x half>)
+declare float @llvm.spv.length.f32(<4 x float>)
diff --git a/llvm/test/CodeGen/SPIRV/iaddcarry-builtin.ll b/llvm/test/CodeGen/SPIRV/iaddcarry-builtin.ll
index 8f14eba2..49aaa45 100644
--- a/llvm/test/CodeGen/SPIRV/iaddcarry-builtin.ll
+++ b/llvm/test/CodeGen/SPIRV/iaddcarry-builtin.ll
@@ -25,9 +25,7 @@
 ; CHECK-SPIRV-DAG:                    [[v4uint:%[a-z0-9_]+]] = OpTypeVector [[uint]] 4
 ; CHECK-SPIRV-DAG:                 [[vecstruct:%[a-z0-9_]+]] = OpTypeStruct [[v4uint]] [[v4uint]]
 ; CHECK-SPIRV-DAG:   [[_ptr_Function_vecstruct:%[a-z0-9_]+]] = OpTypePointer Function [[vecstruct]]
-; CHECK-SPIRV-DAG:               [[struct_anon:%[a-z0-9_.]+]] = OpTypeStruct [[uint]] [[uint]]
-; CHECK-SPIRV-DAG: [[_ptr_Function_struct_anon:%[a-z0-9_]+]] = OpTypePointer Function [[struct_anon]]
-; CHECK-SPIRV-DAG:  [[_ptr_Generic_struct_anon:%[a-z0-9_]+]] = OpTypePointer Generic [[struct_anon]]
+; CHECK-SPIRV-DAG:    [[_ptr_Generic_i32struct:%[a-z0-9_]+]] = OpTypePointer Generic [[i32struct]]
 
 define spir_func void @test_builtin_iaddcarrycc(i8 %a, i8 %b) {
   entry:
@@ -116,9 +114,9 @@ define spir_func void @test_builtin_iaddcarry_anon(i32 %a, i32 %b) {
 ; CHECK-SPIRV:        [[a_4:%[a-z0-9_]+]] = OpFunctionParameter [[uint]]
 ; CHECK-SPIRV:        [[b_4:%[a-z0-9_]+]] = OpFunctionParameter [[uint]]
 ; CHECK-SPIRV:    [[entry_4:%[a-z0-9_]+]] = OpLabel
-; CHECK-SPIRV:     [[var_59:%[a-z0-9_]+]] = OpVariable [[_ptr_Function_struct_anon]] Function
-; CHECK-SPIRV:     [[var_61:%[a-z0-9_]+]] = OpPtrCastToGeneric [[_ptr_Generic_struct_anon]] [[var_59]]
-; CHECK-SPIRV:     [[var_62:%[a-z0-9_]+]] = OpIAddCarry [[struct_anon]] [[a_4]] [[b_4]]
+; CHECK-SPIRV:     [[var_59:%[a-z0-9_]+]] = OpVariable [[_ptr_Function_i32struct]] Function
+; CHECK-SPIRV:     [[var_61:%[a-z0-9_]+]] = OpPtrCastToGeneric [[_ptr_Generic_i32struct]] [[var_59]]
+; CHECK-SPIRV:     [[var_62:%[a-z0-9_]+]] = OpIAddCarry [[i32struct]] [[a_4]] [[b_4]]
 ; CHECK-SPIRV:                              OpStore [[var_61]] [[var_62]]
 
 declare void @_Z17__spirv_IAddCarryIiiE4anonIT_T0_ES1_S2_(ptr addrspace(4) sret(%struct.anon) align 4, i32, i32)
diff --git a/llvm/test/CodeGen/SPIRV/image-unoptimized.ll b/llvm/test/CodeGen/SPIRV/image-unoptimized.ll
index 0ce9c73..d7d5b1d 100644
--- a/llvm/test/CodeGen/SPIRV/image-unoptimized.ll
+++ b/llvm/test/CodeGen/SPIRV/image-unoptimized.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
 
-; CHECK:     %[[#TypeImage:]] = OpTypeImage
-; CHECK:     %[[#TypeSampler:]] = OpTypeSampler
+; CHECK-DAG: %[[#TypeImage:]] = OpTypeImage
+; CHECK-DAG: %[[#TypeSampler:]] = OpTypeSampler
 ; CHECK-DAG: %[[#TypeImagePtr:]] = OpTypePointer {{.*}} %[[#TypeImage]]
 ; CHECK-DAG: %[[#TypeSamplerPtr:]] = OpTypePointer {{.*}} %[[#TypeSampler]]
 
diff --git a/llvm/test/CodeGen/SPIRV/isubborrow-builtin.ll b/llvm/test/CodeGen/SPIRV/isubborrow-builtin.ll
index 08b4d2a..ca842d2f 100644
--- a/llvm/test/CodeGen/SPIRV/isubborrow-builtin.ll
+++ b/llvm/test/CodeGen/SPIRV/isubborrow-builtin.ll
@@ -23,9 +23,7 @@
 ; CHECK-SPIRV-DAG:                    [[v4uint:%[a-z0-9_]+]] = OpTypeVector [[uint]] 4
 ; CHECK-SPIRV-DAG:                 [[vecstruct:%[a-z0-9_]+]] = OpTypeStruct [[v4uint]] [[v4uint]]
 ; CHECK-SPIRV-DAG:   [[_ptr_Function_vecstruct:%[a-z0-9_]+]] = OpTypePointer Function [[vecstruct]]
-; CHECK-SPIRV-DAG:               [[struct_anon:%[a-z0-9_.]+]] = OpTypeStruct [[uint]] [[uint]]
-; CHECK-SPIRV-DAG: [[_ptr_Function_struct_anon:%[a-z0-9_]+]] = OpTypePointer Function [[struct_anon]]
-; CHECK-SPIRV-DAG:  [[_ptr_Generic_struct_anon:%[a-z0-9_]+]] = OpTypePointer Generic [[struct_anon]]
+; CHECK-SPIRV-DAG:    [[_ptr_Generic_i32struct:%[a-z0-9_]+]] = OpTypePointer Generic [[i32struct]]
 
 define spir_func void @test_builtin_isubborrowcc(i8 %a, i8 %b) {
   entry:
@@ -114,9 +112,9 @@ define spir_func void @test_builtin_isubborrow_anon(i32 %a, i32 %b) {
 ; CHECK-SPIRV:        [[a_4:%[a-z0-9_]+]] = OpFunctionParameter [[uint]]
 ; CHECK-SPIRV:        [[b_4:%[a-z0-9_]+]] = OpFunctionParameter [[uint]]
 ; CHECK-SPIRV:    [[entry_4:%[a-z0-9_]+]] = OpLabel
-; CHECK-SPIRV:     [[var_59:%[a-z0-9_]+]] = OpVariable [[_ptr_Function_struct_anon]] Function
-; CHECK-SPIRV:     [[var_61:%[a-z0-9_]+]] = OpPtrCastToGeneric [[_ptr_Generic_struct_anon]] [[var_59]]
-; CHECK-SPIRV:     [[var_62:%[a-z0-9_]+]] = OpISubBorrow [[struct_anon]] [[a_4]] [[b_4]]
+; CHECK-SPIRV:     [[var_59:%[a-z0-9_]+]] = OpVariable [[_ptr_Function_i32struct]] Function
+; CHECK-SPIRV:     [[var_61:%[a-z0-9_]+]] = OpPtrCastToGeneric [[_ptr_Generic_i32struct]] [[var_59]]
+; CHECK-SPIRV:     [[var_62:%[a-z0-9_]+]] = OpISubBorrow [[i32struct]] [[a_4]] [[b_4]]
 ; CHECK-SPIRV:                              OpStore [[var_61]] [[var_62]]
 
 declare void @_Z18__spirv_ISubBorrowIiiE4anonIT_T0_ES1_S2_(ptr addrspace(4) sret(%struct.anon) align 4, i32, i32)
diff --git a/llvm/test/CodeGen/SPIRV/keep-tracked-const.ll b/llvm/test/CodeGen/SPIRV/keep-tracked-const.ll
index 0dc8623..61d06fe 100644
--- a/llvm/test/CodeGen/SPIRV/keep-tracked-const.ll
+++ b/llvm/test/CodeGen/SPIRV/keep-tracked-const.ll
@@ -3,9 +3,9 @@
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK-SPIRV: %[[#Int:]] = OpTypeInt 32 0
-; CHECK-SPIRV: %[[#C0:]] = OpConstant %[[#Int]] 0
-; CHECK-SPIRV: %[[#C1:]] = OpConstant %[[#Int]] 1
+; CHECK-SPIRV-DAG: %[[#Int:]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG: %[[#C0:]] = OpConstant %[[#Int]] 0
+; CHECK-SPIRV-DAG: %[[#C1:]] = OpConstant %[[#Int]] 1
 ; CHECK-SPIRV: OpSelect %[[#Int]] %[[#]] %[[#C1]] %[[#C0]]
 
 
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshl.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshl.ll
index 2d5b309..25b5304 100644
--- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshl.ll
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshl.ll
@@ -1,21 +1,21 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
 
-; CHECK-SPIRV:     OpName %[[#NAME_FSHL_FUNC_32:]] "spirv.llvm_fshl_i32"
-; CHECK-SPIRV:     OpName %[[#NAME_FSHL_FUNC_16:]] "spirv.llvm_fshl_i16"
-; CHECK-SPIRV:     OpName %[[#NAME_FSHL_FUNC_VEC_INT_16:]] "spirv.llvm_fshl_v2i16"
-; CHECK-SPIRV:     %[[#TYPE_INT_32:]] = OpTypeInt 32 0
-; CHECK-SPIRV:     %[[#TYPE_ORIG_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
-; CHECK-SPIRV:     %[[#TYPE_INT_16:]] = OpTypeInt 16 0
-; CHECK-SPIRV:     %[[#TYPE_ORIG_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
-; CHECK-SPIRV:     %[[#TYPE_VEC_INT_16:]] = OpTypeVector %[[#TYPE_INT_16]] 2
-; CHECK-SPIRV:     %[[#TYPE_ORIG_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
-; CHECK-SPIRV:     %[[#TYPE_FSHL_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
-; CHECK-SPIRV:     %[[#TYPE_FSHL_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
-; CHECK-SPIRV:     %[[#TYPE_FSHL_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
-; CHECK-SPIRV-DAG: %[[#CONST_ROTATE_32:]] = OpConstant %[[#TYPE_INT_32]] 8
-; CHECK-SPIRV-DAG: %[[#CONST_ROTATE_16:]] = OpConstant %[[#TYPE_INT_16]] 8
-; CHECK-SPIRV:     %[[#CONST_ROTATE_VEC_INT_16:]] = OpConstantComposite %[[#TYPE_VEC_INT_16]] %[[#CONST_ROTATE_16]] %[[#CONST_ROTATE_16]]
-; CHECK-SPIRV-DAG: %[[#CONST_TYPE_SIZE_32:]] = OpConstant %[[#TYPE_INT_32]] 32
+; CHECK-SPIRV-DAG:     OpName %[[#NAME_FSHL_FUNC_32:]] "spirv.llvm_fshl_i32"
+; CHECK-SPIRV-DAG:     OpName %[[#NAME_FSHL_FUNC_16:]] "spirv.llvm_fshl_i16"
+; CHECK-SPIRV-DAG:     OpName %[[#NAME_FSHL_FUNC_VEC_INT_16:]] "spirv.llvm_fshl_v2i16"
+; CHECK-SPIRV-DAG:     %[[#TYPE_INT_32:]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG:     %[[#TYPE_ORIG_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
+; CHECK-SPIRV-DAG:     %[[#TYPE_INT_16:]] = OpTypeInt 16 0
+; CHECK-SPIRV-DAG:     %[[#TYPE_ORIG_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
+; CHECK-SPIRV-DAG:     %[[#TYPE_VEC_INT_16:]] = OpTypeVector %[[#TYPE_INT_16]] 2
+; CHECK-SPIRV-DAG:     %[[#TYPE_ORIG_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
+; CHECK-SPIRV-DAG:     %[[#TYPE_FSHL_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
+; CHECK-SPIRV-DAG:     %[[#TYPE_FSHL_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
+; CHECK-SPIRV-DAG:     %[[#TYPE_FSHL_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
+; CHECK-SPIRV-DAG:     %[[#CONST_ROTATE_32:]] = OpConstant %[[#TYPE_INT_32]] 8
+; CHECK-SPIRV-DAG:     %[[#CONST_ROTATE_16:]] = OpConstant %[[#TYPE_INT_16]] 8
+; CHECK-SPIRV-DAG:     %[[#CONST_ROTATE_VEC_INT_16:]] = OpConstantComposite %[[#TYPE_VEC_INT_16]] %[[#CONST_ROTATE_16]] %[[#CONST_ROTATE_16]]
+; CHECK-SPIRV-DAG:     %[[#CONST_TYPE_SIZE_32:]] = OpConstant %[[#TYPE_INT_32]] 32
 
 ; CHECK-SPIRV: %[[#]] = OpFunction %[[#TYPE_INT_32]] {{.*}} %[[#TYPE_ORIG_FUNC_32]]
 ; CHECK-SPIRV: %[[#X:]] = OpFunctionParameter %[[#TYPE_INT_32]]
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshr.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshr.ll
index 4cf5ca5..55fb2d9 100644
--- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshr.ll
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshr.ll
@@ -1,20 +1,20 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
 
-; CHECK-SPIRV:     OpName %[[#NAME_FSHR_FUNC_32:]] "spirv.llvm_fshr_i32"
-; CHECK-SPIRV:     OpName %[[#NAME_FSHR_FUNC_16:]] "spirv.llvm_fshr_i16"
-; CHECK-SPIRV:     OpName %[[#NAME_FSHR_FUNC_VEC_INT_16:]] "spirv.llvm_fshr_v2i16"
-; CHECK-SPIRV:     %[[#TYPE_INT_32:]] = OpTypeInt 32 0
-; CHECK-SPIRV:     %[[#TYPE_ORIG_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
-; CHECK-SPIRV:     %[[#TYPE_INT_16:]] = OpTypeInt 16 0
-; CHECK-SPIRV:     %[[#TYPE_ORIG_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
-; CHECK-SPIRV:     %[[#TYPE_VEC_INT_16:]] = OpTypeVector %[[#TYPE_INT_16]] 2
-; CHECK-SPIRV:     %[[#TYPE_ORIG_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
-; CHECK-SPIRV:     %[[#TYPE_FSHR_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
-; CHECK-SPIRV:     %[[#TYPE_FSHR_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
-; CHECK-SPIRV:     %[[#TYPE_FSHR_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
+; CHECK-SPIRV-DAG: OpName %[[#NAME_FSHR_FUNC_32:]] "spirv.llvm_fshr_i32"
+; CHECK-SPIRV-DAG: OpName %[[#NAME_FSHR_FUNC_16:]] "spirv.llvm_fshr_i16"
+; CHECK-SPIRV-DAG: OpName %[[#NAME_FSHR_FUNC_VEC_INT_16:]] "spirv.llvm_fshr_v2i16"
+; CHECK-SPIRV-DAG: %[[#TYPE_INT_32:]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG: %[[#TYPE_ORIG_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
+; CHECK-SPIRV-DAG: %[[#TYPE_INT_16:]] = OpTypeInt 16 0
+; CHECK-SPIRV-DAG: %[[#TYPE_ORIG_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
+; CHECK-SPIRV-DAG: %[[#TYPE_VEC_INT_16:]] = OpTypeVector %[[#TYPE_INT_16]] 2
+; CHECK-SPIRV-DAG: %[[#TYPE_ORIG_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
+; CHECK-SPIRV-DAG: %[[#TYPE_FSHR_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
+; CHECK-SPIRV-DAG: %[[#TYPE_FSHR_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
+; CHECK-SPIRV-DAG: %[[#TYPE_FSHR_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
 ; CHECK-SPIRV-DAG: %[[#CONST_ROTATE_32:]] = OpConstant %[[#TYPE_INT_32]] 8
 ; CHECK-SPIRV-DAG: %[[#CONST_ROTATE_16:]] = OpConstant %[[#TYPE_INT_16]] 8
-; CHECK-SPIRV:     %[[#CONST_ROTATE_VEC_INT_16:]] = OpConstantComposite %[[#TYPE_VEC_INT_16]] %[[#CONST_ROTATE_16]] %[[#CONST_ROTATE_16]]
+; CHECK-SPIRV-DAG: %[[#CONST_ROTATE_VEC_INT_16:]] = OpConstantComposite %[[#TYPE_VEC_INT_16]] %[[#CONST_ROTATE_16]] %[[#CONST_ROTATE_16]]
 ; CHECK-SPIRV-DAG: %[[#CONST_TYPE_SIZE_32:]] = OpConstant %[[#TYPE_INT_32]] 32
 
 ; CHECK-SPIRV: %[[#]] = OpFunction %[[#TYPE_INT_32]] {{.*}} %[[#TYPE_ORIG_FUNC_32]]
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll
index e7a9869..d5e70ae 100644
--- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll
@@ -12,17 +12,17 @@
 ; CHECK-DAG: %[[#Int8Ptr:]] = OpTypePointer Generic %[[#Int8]]
 
 ; CHECK-DAG: %[[#Const4:]] = OpConstant %[[#Int32]] 4
-; CHECK: %[[#Int8x4:]] = OpTypeArray %[[#Int8]] %[[#Const4]]
+; CHECK-DAG: %[[#Int8x4:]] = OpTypeArray %[[#Int8]] %[[#Const4]]
 
 ; CHECK-DAG: %[[#Const12:]] = OpConstant %[[#Int32]] 12
-; CHECK: %[[#Int8x12:]] = OpTypeArray %[[#Int8]] %[[#Const12]]
+; CHECK-DAG: %[[#Int8x12:]] = OpTypeArray %[[#Int8]] %[[#Const12]]
 
 ; CHECK-DAG: %[[#Const21:]] = OpConstant %[[#Int8]] 21
 ; CHECK-DAG: %[[#False:]] = OpConstantFalse %[[#]]
 ; CHECK-DAG: %[[#ConstComp:]] = OpConstantComposite %[[#Int8x4]] %[[#Const21]] %[[#Const21]] %[[#Const21]] %[[#Const21]]
 ; CHECK-DAG: %[[#ConstNull:]] = OpConstantNull %[[#Int8x12]]
-; CHECK: %[[#VarComp:]] = OpVariable %[[#]] UniformConstant %[[#ConstComp]]
-; CHECK: %[[#VarNull:]] = OpVariable %[[#]] UniformConstant %[[#ConstNull]]
+; CHECK-DAG: %[[#VarComp:]] = OpVariable %[[#]] UniformConstant %[[#ConstComp]]
+; CHECK-DAG: %[[#VarNull:]] = OpVariable %[[#]] UniformConstant %[[#ConstNull]]
 
 ; CHECK-DAG: %[[#Int8PtrConst:]] = OpTypePointer UniformConstant %[[#Int8]]
 ; CHECK: OpCopyMemorySized %[[#Target:]] %[[#Source:]] %[[#Const12]] Aligned 4
diff --git a/llvm/test/CodeGen/SPIRV/logical-access-chain.ll b/llvm/test/CodeGen/SPIRV/logical-access-chain.ll
index 39f6d33..d56678e 100644
--- a/llvm/test/CodeGen/SPIRV/logical-access-chain.ll
+++ b/llvm/test/CodeGen/SPIRV/logical-access-chain.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s
 
-; CHECK:      [[uint:%[0-9]+]] = OpTypeInt 32 0
-; CHECK:     [[uint2:%[0-9]+]] = OpTypeVector [[uint]] 2
-; CHECK:    [[uint_1:%[0-9]+]] = OpConstant [[uint]] 1
-; CHECK:  [[ptr_uint:%[0-9]+]] = OpTypePointer Function [[uint]]
-; CHECK: [[ptr_uint2:%[0-9]+]] = OpTypePointer Function [[uint2]]
+; CHECK-DAG:      [[uint:%[0-9]+]] = OpTypeInt 32 0
+; CHECK-DAG:     [[uint2:%[0-9]+]] = OpTypeVector [[uint]] 2
+; CHECK-DAG:    [[uint_1:%[0-9]+]] = OpConstant [[uint]] 1
+; CHECK-DAG:  [[ptr_uint:%[0-9]+]] = OpTypePointer Function [[uint]]
+; CHECK-DAG: [[ptr_uint2:%[0-9]+]] = OpTypePointer Function [[uint2]]
 
 define void @main() #1 {
 entry:
diff --git a/llvm/test/CodeGen/SPIRV/opencl/degrees.ll b/llvm/test/CodeGen/SPIRV/opencl/degrees.ll
index 88f9783..b8d4f52 100644
--- a/llvm/test/CodeGen/SPIRV/opencl/degrees.ll
+++ b/llvm/test/CodeGen/SPIRV/opencl/degrees.ll
@@ -3,7 +3,7 @@
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "OpenCL.std"
+; CHECK-DAG: %[[#op_ext_ocl:]] = OpExtInstImport "OpenCL.std"
 
 ; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32
 ; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16
@@ -20,7 +20,7 @@ declare <4 x half> @llvm.spv.degrees.v4f16(<4 x half>)
 define noundef float @degrees_float(float noundef %a) {
 entry:
 ; CHECK: %[[#float_32_arg:]] = OpFunctionParameter %[[#float_32]]
-; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] degrees %[[#float_32_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_ocl]] degrees %[[#float_32_arg]]
   %elt.degrees = call float @llvm.spv.degrees.f32(float %a)
   ret float %elt.degrees
 }
@@ -28,7 +28,7 @@ entry:
 define noundef half @degrees_half(half noundef %a) {
 entry:
 ; CHECK: %[[#float_16_arg:]] = OpFunctionParameter %[[#float_16]]
-; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] degrees %[[#float_16_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_ocl]] degrees %[[#float_16_arg]]
   %elt.degrees = call half @llvm.spv.degrees.f16(half %a)
   ret half %elt.degrees
 }
@@ -36,7 +36,7 @@ entry:
 define noundef <4 x float> @degrees_float_vector(<4 x float> noundef %a) {
 entry:
 ; CHECK: %[[#vec4_float_32_arg:]] = OpFunctionParameter %[[#vec4_float_32]]
-; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] degrees %[[#vec4_float_32_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_ocl]] degrees %[[#vec4_float_32_arg]]
   %elt.degrees = call <4 x float> @llvm.spv.degrees.v4f32(<4 x float> %a)
   ret <4 x float> %elt.degrees
 }
@@ -44,7 +44,7 @@ entry:
 define noundef <4 x half> @degrees_half_vector(<4 x half> noundef %a) {
 entry:
 ; CHECK: %[[#vec4_float_16_arg:]] = OpFunctionParameter %[[#vec4_float_16]]
-; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] degrees %[[#vec4_float_16_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_ocl]] degrees %[[#vec4_float_16_arg]]
   %elt.degrees = call <4 x half> @llvm.spv.degrees.v4f16(<4 x half> %a)
   ret <4 x half> %elt.degrees
 }
diff --git a/llvm/test/CodeGen/SPIRV/opencl/distance.ll b/llvm/test/CodeGen/SPIRV/opencl/distance.ll
new file mode 100644
index 0000000..ac18804
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/opencl/distance.ll
@@ -0,0 +1,34 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: %[[#op_ext_cl:]] = OpExtInstImport "OpenCL.std"
+
+; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16
+; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4
+; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32
+; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4
+
+define noundef half @distance_half4(<4 x half> noundef %a, <4 x half> noundef %b) {
+entry:
+  ; CHECK: %[[#]] = OpFunction %[[#float_16]] None %[[#]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]]
+  ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_16]]
+  ; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_cl]] distance %[[#arg0]] %[[#arg1]]
+  %spv.distance = call half @llvm.spv.distance.f16(<4 x half> %a, <4 x half> %b)
+  ret half %spv.distance
+}
+
+define noundef float @distance_float4(<4 x float> noundef %a, <4 x float> noundef %b) {
+entry:
+  ; CHECK: %[[#]] = OpFunction %[[#float_32]] None %[[#]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]]
+  ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_32]]
+  ; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_cl]] distance %[[#arg0]] %[[#arg1]]
+  %spv.distance = call float @llvm.spv.distance.f32(<4 x float> %a, <4 x float> %b)
+  ret float %spv.distance
+}
+
+declare half @llvm.spv.distance.f16(<4 x half>, <4 x half>)
+declare float @llvm.spv.distance.f32(<4 x float>, <4 x float>)
diff --git a/llvm/test/CodeGen/SPIRV/opencl/radians.ll b/llvm/test/CodeGen/SPIRV/opencl/radians.ll
index f7bb8d5..5b4f26a 100644
--- a/llvm/test/CodeGen/SPIRV/opencl/radians.ll
+++ b/llvm/test/CodeGen/SPIRV/opencl/radians.ll
@@ -3,7 +3,7 @@
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "OpenCL.std"
+; CHECK-DAG: %[[#op_ext_ocl:]] = OpExtInstImport "OpenCL.std"
 
 ; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32
 ; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16
@@ -20,7 +20,7 @@ declare <4 x half> @llvm.spv.radians.v4f16(<4 x half>)
 define noundef float @radians_float(float noundef %a) {
 entry:
 ; CHECK: %[[#float_32_arg:]] = OpFunctionParameter %[[#float_32]]
-; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] radians %[[#float_32_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_ocl]] radians %[[#float_32_arg]]
   %elt.radians = call float @llvm.spv.radians.f32(float %a)
   ret float %elt.radians
 }
@@ -28,7 +28,7 @@ entry:
 define noundef half @radians_half(half noundef %a) {
 entry:
 ; CHECK: %[[#float_16_arg:]] = OpFunctionParameter %[[#float_16]]
-; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] radians %[[#float_16_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_ocl]] radians %[[#float_16_arg]]
   %elt.radians = call half @llvm.spv.radians.f16(half %a)
   ret half %elt.radians
 }
@@ -36,7 +36,7 @@ entry:
 define noundef <4 x float> @radians_float_vector(<4 x float> noundef %a) {
 entry:
 ; CHECK: %[[#vec4_float_32_arg:]] = OpFunctionParameter %[[#vec4_float_32]]
-; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] radians %[[#vec4_float_32_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_ocl]] radians %[[#vec4_float_32_arg]]
   %elt.radians = call <4 x float> @llvm.spv.radians.v4f32(<4 x float> %a)
   ret <4 x float> %elt.radians
 }
@@ -44,7 +44,7 @@ entry:
 define noundef <4 x half> @radians_half_vector(<4 x half> noundef %a) {
 entry:
 ; CHECK: %[[#vec4_float_16_arg:]] = OpFunctionParameter %[[#vec4_float_16]]
-; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] radians %[[#vec4_float_16_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_ocl]] radians %[[#vec4_float_16_arg]]
   %elt.radians = call <4 x half> @llvm.spv.radians.v4f16(<4 x half> %a)
   ret <4 x half> %elt.radians
 }
diff --git a/llvm/test/CodeGen/SPIRV/pointers/PtrCast-null-in-OpSpecConstantOp.ll b/llvm/test/CodeGen/SPIRV/pointers/PtrCast-null-in-OpSpecConstantOp.ll
index 99e2c3e..dee16da 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/PtrCast-null-in-OpSpecConstantOp.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/PtrCast-null-in-OpSpecConstantOp.ll
@@ -5,10 +5,8 @@
 ; CHECK-DAG: %[[Struct:.*]] = OpTypeStruct %[[Array]]
 ; CHECK-DAG: %[[Zero:.*]] = OpTypeInt 64 0
 ; CHECK-DAG: %[[Null:.*]] = OpConstantNull %[[Zero]]
-; CHECK-DAG: %[[R1:.*]] = OpConstantComposite %[[Array]] %[[Null]]
-; CHECK-DAG: %[[#]] = OpConstantComposite %[[Struct]] %[[R1]]
-; CHECK-DAG: %[[R2:.*]] = OpConstantComposite %[[Array]] %[[Null]]
-; CHECK-DAG: %[[#]] = OpConstantComposite %[[Struct]] %[[R2]]
+; CHECK-DAG: %[[R:.*]] = OpConstantComposite %[[Array]] %[[Null]]
+; CHECK-DAG: %[[#]] = OpConstantComposite %[[Struct]] %[[R]]
 
 @G1 = addrspace(1) constant { [1 x ptr addrspace(4)] } { [1 x ptr addrspace(4)] [ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4))] }
 @G2 = addrspace(1) constant { [1 x ptr addrspace(4)] } { [1 x ptr addrspace(4)] [ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4))] }
diff --git a/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll b/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll
index 03ecf5e..59a2423 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll
@@ -1,12 +1,12 @@
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK: %[[TyInt64:.*]] = OpTypeInt 64 0
-; CHECK: %[[TyInt64Ptr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyInt64]]
-; CHECK: %[[TyStruct:.*]] = OpTypeStruct %[[TyInt64Ptr]] %[[TyInt64Ptr]]
-; CHECK: %[[ConstStruct:.*]] = OpConstantComposite %[[TyStruct]] %[[ConstField:.*]] %[[ConstField]]
-; CHECK: %[[TyStructPtr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyStruct]]
-; CHECK: OpVariable %[[TyStructPtr]] {{[a-zA-Z]+}} %[[ConstStruct]]
+; CHECK-DAG: %[[TyInt64:.*]] = OpTypeInt 64 0
+; CHECK-DAG: %[[TyInt64Ptr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyInt64]]
+; CHECK-DAG: %[[TyStruct:.*]] = OpTypeStruct %[[TyInt64Ptr]] %[[TyInt64Ptr]]
+; CHECK-DAG: %[[ConstStruct:.*]] = OpConstantComposite %[[TyStruct]] %[[ConstField:.*]] %[[ConstField]]
+; CHECK-DAG: %[[TyStructPtr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyStruct]]
+; CHECK-DAG: OpVariable %[[TyStructPtr]] {{[a-zA-Z]+}} %[[ConstStruct]]
 
 @a = addrspace(1) constant i64 42
 @struct = addrspace(1) global {ptr addrspace(1), ptr addrspace(1)} { ptr addrspace(1) @a, ptr addrspace(1) @a }
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/SampledImage.ll b/llvm/test/CodeGen/SPIRV/transcoding/SampledImage.ll
index e4c7bdb..8a90e40 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/SampledImage.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/SampledImage.ll
@@ -24,12 +24,10 @@
 ; CHECK-SPIRV: OpName %[[#sample_kernel_float:]] "sample_kernel_float"
 ; CHECK-SPIRV: OpName %[[#sample_kernel_int:]] "sample_kernel_int"
 
-; CHECK-SPIRV:     %[[#TypeSampler:]] = OpTypeSampler
+; CHECK-SPIRV-DAG: %[[#TypeSampler:]] = OpTypeSampler
 ; CHECK-SPIRV-DAG: %[[#SampledImageTy:]] = OpTypeSampledImage
 ; CHECK-SPIRV-DAG: %[[#ConstSampler1:]] = OpConstantSampler %[[#TypeSampler]] None 0 Linear
 ; CHECK-SPIRV-DAG: %[[#ConstSampler2:]] = OpConstantSampler %[[#TypeSampler]] Repeat 0 Nearest
-; CHECK-SPIRV-DAG: %[[#ConstSampler3:]] = OpConstantSampler %[[#TypeSampler]] None 0 Linear
-; CHECK-SPIRV-DAG: %[[#ConstSampler4:]] = OpConstantSampler %[[#TypeSampler]] Repeat 0 Nearest
 
 ; CHECK-SPIRV: %[[#sample_kernel_float]] = OpFunction %{{.*}}
 ; CHECK-SPIRV: %[[#InputImage:]] = OpFunctionParameter %{{.*}}
@@ -65,13 +63,13 @@ declare spir_func target("spirv.Sampler") @__translate_sampler_initializer(i32)
 ; CHECK-SPIRV: %[[#InputImage:]] = OpFunctionParameter %{{.*}}
 ; CHECK-SPIRV: %[[#argSampl:]] = OpFunctionParameter %[[#TypeSampler]]
 
-; CHECK-SPIRV: %[[#SampledImage4:]] = OpSampledImage %[[#SampledImageTy]] %[[#InputImage]] %[[#ConstSampler3]]
+; CHECK-SPIRV: %[[#SampledImage4:]] = OpSampledImage %[[#SampledImageTy]] %[[#InputImage]] %[[#ConstSampler1]]
 ; CHECK-SPIRV: %[[#]] = OpImageSampleExplicitLod %[[#]] %[[#SampledImage4]]
 
 ; CHECK-SPIRV: %[[#SampledImage5:]] = OpSampledImage %[[#SampledImageTy]] %[[#InputImage]] %[[#argSampl]]
 ; CHECK-SPIRV: %[[#]] = OpImageSampleExplicitLod %[[#]] %[[#SampledImage5]]
 
-; CHECK-SPIRV: %[[#SampledImage6:]] = OpSampledImage %[[#SampledImageTy]] %[[#InputImage]] %[[#ConstSampler4]]
+; CHECK-SPIRV: %[[#SampledImage6:]] = OpSampledImage %[[#SampledImageTy]] %[[#InputImage]] %[[#ConstSampler2]]
 ; CHECK-SPIRV: %[[#]] = OpImageSampleExplicitLod %[[#]] %[[#SampledImage6]]
 
 define dso_local spir_kernel void @sample_kernel_int(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %input, <2 x float> noundef %coords, <4 x i32> addrspace(1)* nocapture noundef writeonly %results, target("spirv.Sampler") %argSampl) local_unnamed_addr {
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll b/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll
index 8b326e2..55f1125 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll
@@ -39,7 +39,7 @@
 ; CHECK-SPIRV-DAG: %[[#SAMP:]] = OpTypeSampler
 ; CHECK-SPIRV-DAG: %[[#SAMPIMG:]] = OpTypeSampledImage %[[#IMG2D_RD]]
 
-; CHECK-SPIRV:     %[[#SAMP_CONST:]] = OpConstantSampler %[[#SAMP]] None 0 Linear
+; CHECK-SPIRV-DAG: %[[#SAMP_CONST:]] = OpConstantSampler %[[#SAMP]] None 0 Linear
 
 ; CHECK-SPIRV: %[[#]] = OpFunctionParameter %[[#PIPE_RD]]
 ; CHECK-SPIRV: %[[#]] = OpFunctionParameter %[[#PIPE_WR]]
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll b/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll
index 74dbaab..5810d9c 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll
@@ -14,19 +14,19 @@
 ; CHECK-SPIRV-DAG:    %[[#twelve:]] = OpConstant %[[#i32]] 12
 ; CHECK-SPIRV-DAG:    %[[#const_i32x3_ptr:]] = OpTypePointer UniformConstant %[[#i32x3]]
 
-; CHECK-SPIRV:        %[[#test_arr2:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]]
-; CHECK-SPIRV:        %[[#test_arr:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]]
+; CHECK-SPIRV-DAG:    %[[#test_arr1:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]]
+; CHECK-SPIRV-DAG:    %[[#test_arr2:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]]
 
 ; CHECK-SPIRV-DAG:    %[[#i32x3_ptr:]] = OpTypePointer Function %[[#i32x3]]
 
-; CHECK-SPIRV:        %[[#arr:]] = OpVariable %[[#i32x3_ptr]] Function
+; CHECK-SPIRV:        %[[#arr1:]] = OpVariable %[[#i32x3_ptr]] Function
 ; CHECK-SPIRV:        %[[#arr2:]] = OpVariable %[[#i32x3_ptr]] Function
 
-; CHECK-SPIRV-32:     OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#twelve]] Aligned 4
+; CHECK-SPIRV-32:     OpCopyMemorySized %[[#arr1]] %[[#test_arr1]] %[[#twelve]] Aligned 4
 ; CHECK-SPIRV-32:     OpCopyMemorySized %[[#arr2]] %[[#test_arr2]] %[[#twelve]] Aligned 4
 
 ; CHECK-SPIRV-64:     %[[#twelvezext1:]] = OpUConvert %[[#i64:]] %[[#twelve:]]
-; CHECK-SPIRV-64:     OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#twelvezext1]] Aligned 4
+; CHECK-SPIRV-64:     OpCopyMemorySized %[[#arr1]] %[[#test_arr1]] %[[#twelvezext1]] Aligned 4
 ; CHECK-SPIRV-64:     %[[#twelvezext2:]] = OpUConvert %[[#i64:]] %[[#twelve:]]
 ; CHECK-SPIRV-64:     OpCopyMemorySized %[[#arr2]] %[[#test_arr2]] %[[#twelvezext2]] Aligned 4
 
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/sub_group_non_uniform_arithmetic.ll b/llvm/test/CodeGen/SPIRV/transcoding/sub_group_non_uniform_arithmetic.ll
index adf73fe..62b09f6 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/sub_group_non_uniform_arithmetic.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/sub_group_non_uniform_arithmetic.ll
@@ -329,12 +329,12 @@
 ; CHECK-SPIRV-DAG: %[[#double:]] = OpTypeFloat 64
 
 ; CHECK-SPIRV-DAG: %[[#false:]] = OpConstantFalse %[[#bool]]
+; CHECK-SPIRV-DAG: %[[#int_32:]] = OpConstant %[[#int]] 32
 ; CHECK-SPIRV-DAG: %[[#ScopeSubgroup:]] = OpConstant %[[#int]] 3
 ; CHECK-SPIRV-DAG: %[[#char_0:]] = OpConstant %[[#char]] 0
 ; CHECK-SPIRV-DAG: %[[#char_10:]] = OpConstant %[[#char]] 10
 ; CHECK-SPIRV-DAG: %[[#short_0:]] = OpConstant %[[#short]] 0
 ; CHECK-SPIRV-DAG: %[[#int_0:]] = OpConstant %[[#int]] 0
-; CHECK-SPIRV-DAG: %[[#int_32:]] = OpConstant %[[#int]] 32
 ; CHECK-SPIRV-DAG: %[[#long_0:]] = OpConstantNull %[[#long]]
 ; CHECK-SPIRV-DAG: %[[#half_0:]] = OpConstant %[[#half]] 0
 ; CHECK-SPIRV-DAG: %[[#float_0:]] = OpConstant %[[#float]] 0
diff --git a/llvm/test/CodeGen/SPIRV/unnamed-global.ll b/llvm/test/CodeGen/SPIRV/unnamed-global.ll
index f72334b..90bac50 100644
--- a/llvm/test/CodeGen/SPIRV/unnamed-global.ll
+++ b/llvm/test/CodeGen/SPIRV/unnamed-global.ll
@@ -4,10 +4,10 @@
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK: %[[TyInt:.*]] = OpTypeInt 8 0
-; CHECK: %[[ConstInt:.*]] = OpConstant %[[TyInt]] 123
-; CHECK: %[[TyPtr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyInt]]
-; CHECK: %[[VarId:.*]] = OpVariable %[[TyPtr]] {{[a-zA-Z]+}} %[[ConstInt]]
+; CHECK-DAG: %[[TyInt:.*]] = OpTypeInt 8 0
+; CHECK-DAG: %[[ConstInt:.*]] = OpConstant %[[TyInt]] 123
+; CHECK-DAG: %[[TyPtr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyInt]]
+; CHECK-DAG: %[[VarId:.*]] = OpVariable %[[TyPtr]] {{[a-zA-Z]+}} %[[ConstInt]]
 
 @0 = addrspace(1) global i8 123
 
diff --git a/llvm/test/CodeGen/Thumb2/bf16-instructions.ll b/llvm/test/CodeGen/Thumb2/bf16-instructions.ll
index 5de7afc..786e355 100644
--- a/llvm/test/CodeGen/Thumb2/bf16-instructions.ll
+++ b/llvm/test/CodeGen/Thumb2/bf16-instructions.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple thumbv8.1m.main-none-eabi | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP
-; RUN: llc < %s -mtriple thumbv8.1m.main-none-eabihf -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FP
+; RUN: llc < %s -mtriple thumbv8.1m.main-none-eabihf -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FP,CHECK-FPNO64
+; RUN: llc < %s -mtriple thumbv8.1m.main-none-eabihf -mattr=+fullfp16,+fp64 | FileCheck %s --check-prefixes=CHECK,CHECK-FP,CHECK-FP64
 
 define bfloat @test_fadd(bfloat %a, bfloat %b) {
 ; CHECK-NOFP-LABEL: test_fadd:
@@ -259,9 +260,8 @@ define void @test_truncstore64(double %a, ptr %b) {
 ; CHECK-FP-NEXT:    .save {r4, lr}
 ; CHECK-FP-NEXT:    push {r4, lr}
 ; CHECK-FP-NEXT:    mov r4, r0
-; CHECK-FP-NEXT:    vmov r0, r1, d0
-; CHECK-FP-NEXT:    bl __aeabi_d2f
-; CHECK-FP-NEXT:    lsrs r0, r0, #16
+; CHECK-FP-NEXT:    bl __truncdfbf2
+; CHECK-FP-NEXT:    vmov r0, s0
 ; CHECK-FP-NEXT:    strh r0, [r4]
 ; CHECK-FP-NEXT:    pop {r4, pc}
   %r = fptrunc double %a to bfloat
@@ -312,15 +312,23 @@ define double @test_loadext64(ptr %a) {
 ; CHECK-NOFP-NEXT:    bl __aeabi_f2d
 ; CHECK-NOFP-NEXT:    pop {r7, pc}
 ;
-; CHECK-FP-LABEL: test_loadext64:
-; CHECK-FP:       @ %bb.0:
-; CHECK-FP-NEXT:    .save {r7, lr}
-; CHECK-FP-NEXT:    push {r7, lr}
-; CHECK-FP-NEXT:    ldrh r0, [r0]
-; CHECK-FP-NEXT:    lsls r0, r0, #16
-; CHECK-FP-NEXT:    bl __aeabi_f2d
-; CHECK-FP-NEXT:    vmov d0, r0, r1
-; CHECK-FP-NEXT:    pop {r7, pc}
+; CHECK-FPNO64-LABEL: test_loadext64:
+; CHECK-FPNO64:       @ %bb.0:
+; CHECK-FPNO64-NEXT:    .save {r7, lr}
+; CHECK-FPNO64-NEXT:    push {r7, lr}
+; CHECK-FPNO64-NEXT:    ldrh r0, [r0]
+; CHECK-FPNO64-NEXT:    lsls r0, r0, #16
+; CHECK-FPNO64-NEXT:    bl __aeabi_f2d
+; CHECK-FPNO64-NEXT:    vmov d0, r0, r1
+; CHECK-FPNO64-NEXT:    pop {r7, pc}
+;
+; CHECK-FP64-LABEL: test_loadext64:
+; CHECK-FP64:       @ %bb.0:
+; CHECK-FP64-NEXT:    ldrh r0, [r0]
+; CHECK-FP64-NEXT:    lsls r0, r0, #16
+; CHECK-FP64-NEXT:    vmov s0, r0
+; CHECK-FP64-NEXT:    vcvt.f64.f32 d0, s0
+; CHECK-FP64-NEXT:    bx lr
   %r = load bfloat, ptr %a
   %d = fpext bfloat %r to double
   ret double %d
@@ -1374,10 +1382,7 @@ define bfloat @test_fptrunc_double(double %a) {
 ; CHECK-FP:       @ %bb.0:
 ; CHECK-FP-NEXT:    .save {r7, lr}
 ; CHECK-FP-NEXT:    push {r7, lr}
-; CHECK-FP-NEXT:    vmov r0, r1, d0
-; CHECK-FP-NEXT:    bl __aeabi_d2f
-; CHECK-FP-NEXT:    lsrs r0, r0, #16
-; CHECK-FP-NEXT:    vmov.f16 s0, r0
+; CHECK-FP-NEXT:    bl __truncdfbf2
 ; CHECK-FP-NEXT:    vmov.f16 r0, s0
 ; CHECK-FP-NEXT:    vmov s0, r0
 ; CHECK-FP-NEXT:    pop {r7, pc}
@@ -1410,15 +1415,23 @@ define double @test_fpext_double(bfloat %a) {
 ; CHECK-NOFP-NEXT:    bl __aeabi_f2d
 ; CHECK-NOFP-NEXT:    pop {r7, pc}
 ;
-; CHECK-FP-LABEL: test_fpext_double:
-; CHECK-FP:       @ %bb.0:
-; CHECK-FP-NEXT:    .save {r7, lr}
-; CHECK-FP-NEXT:    push {r7, lr}
-; CHECK-FP-NEXT:    vmov r0, s0
-; CHECK-FP-NEXT:    lsls r0, r0, #16
-; CHECK-FP-NEXT:    bl __aeabi_f2d
-; CHECK-FP-NEXT:    vmov d0, r0, r1
-; CHECK-FP-NEXT:    pop {r7, pc}
+; CHECK-FPNO64-LABEL: test_fpext_double:
+; CHECK-FPNO64:       @ %bb.0:
+; CHECK-FPNO64-NEXT:    .save {r7, lr}
+; CHECK-FPNO64-NEXT:    push {r7, lr}
+; CHECK-FPNO64-NEXT:    vmov r0, s0
+; CHECK-FPNO64-NEXT:    lsls r0, r0, #16
+; CHECK-FPNO64-NEXT:    bl __aeabi_f2d
+; CHECK-FPNO64-NEXT:    vmov d0, r0, r1
+; CHECK-FPNO64-NEXT:    pop {r7, pc}
+;
+; CHECK-FP64-LABEL: test_fpext_double:
+; CHECK-FP64:       @ %bb.0:
+; CHECK-FP64-NEXT:    vmov r0, s0
+; CHECK-FP64-NEXT:    lsls r0, r0, #16
+; CHECK-FP64-NEXT:    vmov s0, r0
+; CHECK-FP64-NEXT:    vcvt.f64.f32 d0, s0
+; CHECK-FP64-NEXT:    bx lr
   %r = fpext bfloat %a to double
   ret double %r
 }
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index 93cab25..e63c625 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -14,6 +14,19 @@ define arm_aapcs_vfpcc <4 x i32> @gather_inc_mini_4i32(ptr noalias nocapture rea
   ret <4 x i32> %wide.masked.gather
 }
 
+define arm_aapcs_vfpcc <4 x i32> @gather_inc_mini_4i32_i8(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, <4 x i32> %offs) {
+; CHECK-LABEL: gather_inc_mini_4i32_i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    movs r1, #16
+; CHECK-NEXT:    vadd.i32 q1, q0, r1
+; CHECK-NEXT:    vldrw.u32 q0, [r0, q1]
+; CHECK-NEXT:    bx lr
+  %1 = add <4 x i32> %offs, <i32 16, i32 16, i32 16, i32 16>
+  %2 = getelementptr inbounds i8, i32* %data, <4 x i32> %1
+  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %wide.masked.gather
+}
+
 define arm_aapcs_vfpcc <4 x i32> @gather_inc_minipred_4i32(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, <4 x i32> %offs) {
 ; CHECK-LABEL: gather_inc_minipred_4i32:
 ; CHECK:       @ %bb.0:
@@ -207,20 +220,20 @@ define arm_aapcs_vfpcc <16 x i8> @gather_inc_minipred_16i8(ptr noalias nocapture
 define arm_aapcs_vfpcc void @gather_pre_inc(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n.vec) {
 ; CHECK-LABEL: gather_pre_inc:
 ; CHECK:       @ %bb.0: @ %vector.ph
-; CHECK-NEXT:    adr r3, .LCPI6_0
+; CHECK-NEXT:    adr r3, .LCPI7_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:  .LBB6_1: @ %vector.body
+; CHECK-NEXT:  .LBB7_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [q0, #96]!
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vstrb.8 q1, [r1], #16
-; CHECK-NEXT:    bne .LBB6_1
+; CHECK-NEXT:    bne .LBB7_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
-; CHECK-NEXT:  .LCPI6_0:
+; CHECK-NEXT:  .LCPI7_0:
 ; CHECK-NEXT:    .long 4294967224 @ 0xffffffb8
 ; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
 ; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
@@ -246,23 +259,65 @@ end:
   ret void;
 }
 
+define arm_aapcs_vfpcc void @gather_pre_inc_i8(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n.vec) {
+; CHECK-LABEL: gather_pre_inc_i8:
+; CHECK:       @ %bb.0: @ %vector.ph
+; CHECK-NEXT:    adr r3, .LCPI8_0
+; CHECK-NEXT:    vldrw.u32 q0, [r3]
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:  .LBB8_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q1, [q0, #24]!
+; CHECK-NEXT:    subs r2, #4
+; CHECK-NEXT:    vstrb.8 q1, [r1], #16
+; CHECK-NEXT:    bne .LBB8_1
+; CHECK-NEXT:  @ %bb.2: @ %end
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:  .LCPI8_0:
+; CHECK-NEXT:    .long 4294967278 @ 0xffffffee
+; CHECK-NEXT:    .long 4294967284 @ 0xfffffff4
+; CHECK-NEXT:    .long 4294967290 @ 0xfffffffa
+; CHECK-NEXT:    .long 0 @ 0x0
+vector.ph:                                        ; preds = %for.body.preheader
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
+  %1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
+  %2 = getelementptr inbounds i8, ptr %data, <4 x i32> %1
+  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  %3 = getelementptr inbounds i32, ptr %dst, i32 %index
+  store <4 x i32> %wide.masked.gather, ptr %3, align 4
+  %index.next = add i32 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
+  %4 = icmp eq i32 %index.next, %n.vec
+  br i1 %4, label %end, label %vector.body
+
+end:
+  ret void;
+}
+
 define arm_aapcs_vfpcc void @gather_post_inc(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n.vec43) {
 ; CHECK-LABEL: gather_post_inc:
 ; CHECK:       @ %bb.0: @ %vector.ph41
-; CHECK-NEXT:    adr r3, .LCPI7_0
+; CHECK-NEXT:    adr r3, .LCPI9_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:  .LBB7_1: @ %vector.body39
+; CHECK-NEXT:  .LBB9_1: @ %vector.body39
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [q0, #96]!
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vstrb.8 q1, [r1], #16
-; CHECK-NEXT:    bne .LBB7_1
+; CHECK-NEXT:    bne .LBB9_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
-; CHECK-NEXT:  .LCPI7_0:
+; CHECK-NEXT:  .LCPI9_0:
 ; CHECK-NEXT:    .long 4294967200 @ 0xffffffa0
 ; CHECK-NEXT:    .long 4294967224 @ 0xffffffb8
 ; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
@@ -293,38 +348,38 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(ptr noalias nocapture reado
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
-; CHECK-NEXT:  .LBB8_1: @ %vector.ph.preheader
+; CHECK-NEXT:  .LBB10_1: @ %vector.ph.preheader
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    bic r12, r2, #3
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    sub.w lr, r12, #4
 ; CHECK-NEXT:    add.w r4, r3, lr, lsr #2
-; CHECK-NEXT:    adr r3, .LCPI8_0
+; CHECK-NEXT:    adr r3, .LCPI10_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:  .LBB8_2: @ %vector.ph
+; CHECK-NEXT:  .LBB10_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB8_3 Depth 2
+; CHECK-NEXT:    @ Child Loop BB10_3 Depth 2
 ; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:  .LBB8_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB8_2 Depth=1
+; CHECK-NEXT:  .LBB10_3: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB10_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vldrw.u32 q2, [q1, #16]!
 ; CHECK-NEXT:    vstrb.8 q2, [r0], #16
-; CHECK-NEXT:    le lr, .LBB8_3
+; CHECK-NEXT:    le lr, .LBB10_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB8_2 Depth=1
+; CHECK-NEXT:    @ in Loop: Header=BB10_2 Depth=1
 ; CHECK-NEXT:    cmp r12, r2
-; CHECK-NEXT:    bne .LBB8_2
+; CHECK-NEXT:    bne .LBB10_2
 ; CHECK-NEXT:  @ %bb.5:
 ; CHECK-NEXT:    pop.w {r4, lr}
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
-; CHECK-NEXT:  .LCPI8_0:
+; CHECK-NEXT:  .LCPI10_0:
 ; CHECK-NEXT:    .long 4294967280 @ 0xfffffff0
 ; CHECK-NEXT:    .long 4294967284 @ 0xfffffff4
 ; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
@@ -363,7 +418,7 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
-; CHECK-NEXT:  .LBB9_1: @ %vector.ph.preheader
+; CHECK-NEXT:  .LBB11_1: @ %vector.ph.preheader
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
@@ -371,26 +426,26 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    bic r12, r2, #3
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    sub.w lr, r12, #4
-; CHECK-NEXT:    adr r4, .LCPI9_1
-; CHECK-NEXT:    adr r5, .LCPI9_2
+; CHECK-NEXT:    adr r4, .LCPI11_1
+; CHECK-NEXT:    adr r5, .LCPI11_2
 ; CHECK-NEXT:    vldrw.u32 q1, [r4]
 ; CHECK-NEXT:    add.w r3, r3, lr, lsr #2
-; CHECK-NEXT:    adr.w lr, .LCPI9_0
+; CHECK-NEXT:    adr.w lr, .LCPI11_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r5]
 ; CHECK-NEXT:    vldrw.u32 q2, [lr]
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
 ; CHECK-NEXT:    vadd.i32 q2, q2, r0
-; CHECK-NEXT:  .LBB9_2: @ %vector.ph
+; CHECK-NEXT:  .LBB11_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB9_3 Depth 2
+; CHECK-NEXT:    @ Child Loop BB11_3 Depth 2
 ; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    vmov q3, q1
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov q5, q2
-; CHECK-NEXT:  .LBB9_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB9_2 Depth=1
+; CHECK-NEXT:  .LBB11_3: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB11_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vldrw.u32 q6, [q5, #48]!
 ; CHECK-NEXT:    vldrw.u32 q7, [q3, #48]!
@@ -398,28 +453,28 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    vldrw.u32 q7, [q4, #48]!
 ; CHECK-NEXT:    vadd.i32 q6, q6, q7
 ; CHECK-NEXT:    vstrb.8 q6, [r0], #16
-; CHECK-NEXT:    le lr, .LBB9_3
+; CHECK-NEXT:    le lr, .LBB11_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB9_2 Depth=1
+; CHECK-NEXT:    @ in Loop: Header=BB11_2 Depth=1
 ; CHECK-NEXT:    cmp r12, r2
-; CHECK-NEXT:    bne .LBB9_2
+; CHECK-NEXT:    bne .LBB11_2
 ; CHECK-NEXT:  @ %bb.5:
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    pop.w {r4, r5, r7, lr}
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
-; CHECK-NEXT:  .LCPI9_0:
+; CHECK-NEXT:  .LCPI11_0:
 ; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
 ; CHECK-NEXT:    .long 4294967260 @ 0xffffffdc
 ; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
 ; CHECK-NEXT:    .long 4294967284 @ 0xfffffff4
-; CHECK-NEXT:  .LCPI9_1:
+; CHECK-NEXT:  .LCPI11_1:
 ; CHECK-NEXT:    .long 4294967252 @ 0xffffffd4
 ; CHECK-NEXT:    .long 4294967264 @ 0xffffffe0
 ; CHECK-NEXT:    .long 4294967276 @ 0xffffffec
 ; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
-; CHECK-NEXT:  .LCPI9_2:
+; CHECK-NEXT:  .LCPI11_2:
 ; CHECK-NEXT:    .long 4294967256 @ 0xffffffd8
 ; CHECK-NEXT:    .long 4294967268 @ 0xffffffe4
 ; CHECK-NEXT:    .long 4294967280 @ 0xfffffff0
@@ -467,38 +522,38 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_large(ptr noalias nocapture readon
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
-; CHECK-NEXT:  .LBB10_1: @ %vector.ph.preheader
+; CHECK-NEXT:  .LBB12_1: @ %vector.ph.preheader
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    bic r12, r2, #3
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    sub.w lr, r12, #4
 ; CHECK-NEXT:    add.w r4, r3, lr, lsr #2
-; CHECK-NEXT:    adr r3, .LCPI10_0
+; CHECK-NEXT:    adr r3, .LCPI12_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:  .LBB10_2: @ %vector.ph
+; CHECK-NEXT:  .LBB12_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB10_3 Depth 2
+; CHECK-NEXT:    @ Child Loop BB12_3 Depth 2
 ; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:  .LBB10_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB10_2 Depth=1
+; CHECK-NEXT:  .LBB12_3: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB12_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vldrw.u32 q2, [q1, #508]!
 ; CHECK-NEXT:    vstrb.8 q2, [r0], #16
-; CHECK-NEXT:    le lr, .LBB10_3
+; CHECK-NEXT:    le lr, .LBB12_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB10_2 Depth=1
+; CHECK-NEXT:    @ in Loop: Header=BB12_2 Depth=1
 ; CHECK-NEXT:    cmp r12, r2
-; CHECK-NEXT:    bne .LBB10_2
+; CHECK-NEXT:    bne .LBB12_2
 ; CHECK-NEXT:  @ %bb.5:
 ; CHECK-NEXT:    pop.w {r4, lr}
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
-; CHECK-NEXT:  .LCPI10_0:
+; CHECK-NEXT:  .LCPI12_0:
 ; CHECK-NEXT:    .long 4294966788 @ 0xfffffe04
 ; CHECK-NEXT:    .long 4294966792 @ 0xfffffe08
 ; CHECK-NEXT:    .long 4294966796 @ 0xfffffe0c
@@ -543,7 +598,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(ptr noalias nocapture reado
 ; CHECK-NEXT:    sub sp, #28
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    strd r1, r2, [sp, #4] @ 8-byte Folded Spill
-; CHECK-NEXT:    blt .LBB11_5
+; CHECK-NEXT:    blt .LBB13_5
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
 ; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    movs r6, #1
@@ -553,16 +608,16 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(ptr noalias nocapture reado
 ; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    sub.w r3, r1, #8
 ; CHECK-NEXT:    add.w r8, r6, r3, lsr #3
-; CHECK-NEXT:    adr r3, .LCPI11_0
+; CHECK-NEXT:    adr r3, .LCPI13_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
-; CHECK-NEXT:  .LBB11_2: @ %vector.ph
+; CHECK-NEXT:  .LBB13_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB11_3 Depth 2
+; CHECK-NEXT:    @ Child Loop BB13_3 Depth 2
 ; CHECK-NEXT:    dls lr, r8
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:  .LBB11_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB11_2 Depth=1
+; CHECK-NEXT:  .LBB13_3: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB13_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vstrw.32 q1, [r2]
 ; CHECK-NEXT:    mov r12, r2
@@ -595,19 +650,19 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(ptr noalias nocapture reado
 ; CHECK-NEXT:    vmov.16 q2[6], r7
 ; CHECK-NEXT:    vmov.16 q2[7], r5
 ; CHECK-NEXT:    vstrb.8 q2, [r6], #16
-; CHECK-NEXT:    le lr, .LBB11_3
+; CHECK-NEXT:    le lr, .LBB13_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB11_2 Depth=1
+; CHECK-NEXT:    @ in Loop: Header=BB13_2 Depth=1
 ; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    ldr r3, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r3, r1
-; CHECK-NEXT:    bne .LBB11_2
-; CHECK-NEXT:  .LBB11_5: @ %for.cond.cleanup
+; CHECK-NEXT:    bne .LBB13_2
+; CHECK-NEXT:  .LBB13_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #28
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
-; CHECK-NEXT:  .LCPI11_0:
+; CHECK-NEXT:  .LCPI13_0:
 ; CHECK-NEXT:    .short 0 @ 0x0
 ; CHECK-NEXT:    .short 1 @ 0x1
 ; CHECK-NEXT:    .short 2 @ 0x2
@@ -660,10 +715,10 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    sub sp, #136
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    strd r1, r2, [sp, #64] @ 8-byte Folded Spill
-; CHECK-NEXT:    blt.w .LBB12_5
+; CHECK-NEXT:    blt.w .LBB14_5
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
 ; CHECK-NEXT:    ldr r1, [sp, #68] @ 4-byte Reload
-; CHECK-NEXT:    adr r3, .LCPI12_2
+; CHECK-NEXT:    adr r3, .LCPI14_2
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    movs r2, #1
 ; CHECK-NEXT:    bic r1, r1, #7
@@ -673,17 +728,17 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    vmov.i16 q2, #0x18
 ; CHECK-NEXT:    add.w r1, r2, r1, lsr #3
 ; CHECK-NEXT:    str r1, [sp, #60] @ 4-byte Spill
-; CHECK-NEXT:    adr r1, .LCPI12_0
-; CHECK-NEXT:    adr r2, .LCPI12_1
+; CHECK-NEXT:    adr r1, .LCPI14_0
+; CHECK-NEXT:    adr r2, .LCPI14_1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    vstrw.32 q2, [sp, #72] @ 16-byte Spill
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #24] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
 ; CHECK-NEXT:    add r2, sp, #120
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #8] @ 16-byte Spill
-; CHECK-NEXT:  .LBB12_2: @ %vector.ph
+; CHECK-NEXT:  .LBB14_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB12_3 Depth 2
+; CHECK-NEXT:    @ Child Loop BB14_3 Depth 2
 ; CHECK-NEXT:    ldr r1, [sp, #60] @ 4-byte Reload
 ; CHECK-NEXT:    add.w r10, sp, #104
 ; CHECK-NEXT:    dls lr, r1
@@ -691,8 +746,8 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    vldrw.u32 q4, [sp, #24] @ 16-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q5, [sp, #40] @ 16-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q6, [sp, #8] @ 16-byte Reload
-; CHECK-NEXT:  .LBB12_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB12_2 Depth=1
+; CHECK-NEXT:  .LBB14_3: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB14_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vstrw.32 q5, [r2]
 ; CHECK-NEXT:    mov r8, r2
@@ -786,21 +841,21 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    vadd.i16 q0, q3, q1
 ; CHECK-NEXT:    vadd.i16 q0, q0, q7
 ; CHECK-NEXT:    vstrb.8 q0, [r7], #16
-; CHECK-NEXT:    le lr, .LBB12_3
+; CHECK-NEXT:    le lr, .LBB14_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB12_2 Depth=1
+; CHECK-NEXT:    @ in Loop: Header=BB14_2 Depth=1
 ; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    ldr r3, [sp, #68] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r1, r3
-; CHECK-NEXT:    bne.w .LBB12_2
-; CHECK-NEXT:  .LBB12_5: @ %for.cond.cleanup
+; CHECK-NEXT:    bne.w .LBB14_2
+; CHECK-NEXT:  .LBB14_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #136
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
-; CHECK-NEXT:  .LCPI12_0:
+; CHECK-NEXT:  .LCPI14_0:
 ; CHECK-NEXT:    .short 1 @ 0x1
 ; CHECK-NEXT:    .short 4 @ 0x4
 ; CHECK-NEXT:    .short 7 @ 0x7
@@ -809,7 +864,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    .short 16 @ 0x10
 ; CHECK-NEXT:    .short 19 @ 0x13
 ; CHECK-NEXT:    .short 22 @ 0x16
-; CHECK-NEXT:  .LCPI12_1:
+; CHECK-NEXT:  .LCPI14_1:
 ; CHECK-NEXT:    .short 0 @ 0x0
 ; CHECK-NEXT:    .short 3 @ 0x3
 ; CHECK-NEXT:    .short 6 @ 0x6
@@ -818,7 +873,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    .short 15 @ 0xf
 ; CHECK-NEXT:    .short 18 @ 0x12
 ; CHECK-NEXT:    .short 21 @ 0x15
-; CHECK-NEXT:  .LCPI12_2:
+; CHECK-NEXT:  .LCPI14_2:
 ; CHECK-NEXT:    .short 2 @ 0x2
 ; CHECK-NEXT:    .short 5 @ 0x5
 ; CHECK-NEXT:    .short 8 @ 0x8
@@ -880,21 +935,21 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    sub sp, #312
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    str r1, [sp, #116] @ 4-byte Spill
-; CHECK-NEXT:    blt.w .LBB13_5
+; CHECK-NEXT:    blt.w .LBB15_5
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT:    adr r1, .LCPI13_0
-; CHECK-NEXT:    adr r6, .LCPI13_8
+; CHECK-NEXT:    adr r1, .LCPI15_0
+; CHECK-NEXT:    adr r6, .LCPI15_8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    adr r1, .LCPI13_1
-; CHECK-NEXT:    adr r7, .LCPI13_7
-; CHECK-NEXT:    adr r3, .LCPI13_6
+; CHECK-NEXT:    adr r1, .LCPI15_1
+; CHECK-NEXT:    adr r7, .LCPI15_7
+; CHECK-NEXT:    adr r3, .LCPI15_6
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #96] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    adr r1, .LCPI13_5
+; CHECK-NEXT:    adr r1, .LCPI15_5
 ; CHECK-NEXT:    bic r10, r2, #7
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #80] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r6]
-; CHECK-NEXT:    adr r6, .LCPI13_9
+; CHECK-NEXT:    adr r6, .LCPI15_9
 ; CHECK-NEXT:    vmov.i32 q2, #0x30
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r7]
@@ -905,22 +960,22 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:  .LBB13_2: @ %vector.ph
+; CHECK-NEXT:  .LBB15_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB13_3 Depth 2
-; CHECK-NEXT:    adr r1, .LCPI13_3
+; CHECK-NEXT:    @ Child Loop BB15_3 Depth 2
+; CHECK-NEXT:    adr r1, .LCPI15_3
 ; CHECK-NEXT:    vldrw.u32 q6, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    adr r1, .LCPI13_4
+; CHECK-NEXT:    adr r1, .LCPI15_4
 ; CHECK-NEXT:    vldrw.u32 q5, [r1]
-; CHECK-NEXT:    adr r1, .LCPI13_2
+; CHECK-NEXT:    adr r1, .LCPI15_2
 ; CHECK-NEXT:    vldrw.u32 q3, [r1]
-; CHECK-NEXT:    adr r1, .LCPI13_10
+; CHECK-NEXT:    adr r1, .LCPI15_10
 ; CHECK-NEXT:    vstrw.32 q6, [sp, #280] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q6, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #296] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q3, [r1]
-; CHECK-NEXT:    adr r1, .LCPI13_11
+; CHECK-NEXT:    adr r1, .LCPI15_11
 ; CHECK-NEXT:    ldr.w r8, [sp, #116] @ 4-byte Reload
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #248] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q3, [sp, #80] @ 16-byte Reload
@@ -935,8 +990,8 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    mov r11, r10
 ; CHECK-NEXT:    vstrw.32 q6, [sp, #232] @ 16-byte Spill
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #184] @ 16-byte Spill
-; CHECK-NEXT:  .LBB13_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB13_2 Depth=1
+; CHECK-NEXT:  .LBB15_3: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB15_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vadd.i32 q4, q1, r0
 ; CHECK-NEXT:    vstrw.32 q7, [sp, #136] @ 16-byte Spill
@@ -1114,74 +1169,74 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #168] @ 16-byte Reload
 ; CHECK-NEXT:    vadd.i32 q7, q7, q2
 ; CHECK-NEXT:    vadd.i32 q0, q0, q2
-; CHECK-NEXT:    bne.w .LBB13_3
+; CHECK-NEXT:    bne.w .LBB15_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB13_2 Depth=1
+; CHECK-NEXT:    @ in Loop: Header=BB15_2 Depth=1
 ; CHECK-NEXT:    cmp r10, r2
-; CHECK-NEXT:    bne.w .LBB13_2
-; CHECK-NEXT:  .LBB13_5: @ %for.cond.cleanup
+; CHECK-NEXT:    bne.w .LBB15_2
+; CHECK-NEXT:  .LBB15_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #312
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
-; CHECK-NEXT:  .LCPI13_0:
+; CHECK-NEXT:  .LCPI15_0:
 ; CHECK-NEXT:    .long 38 @ 0x26
 ; CHECK-NEXT:    .long 41 @ 0x29
 ; CHECK-NEXT:    .long 44 @ 0x2c
 ; CHECK-NEXT:    .long 47 @ 0x2f
-; CHECK-NEXT:  .LCPI13_1:
+; CHECK-NEXT:  .LCPI15_1:
 ; CHECK-NEXT:    .long 14 @ 0xe
 ; CHECK-NEXT:    .long 17 @ 0x11
 ; CHECK-NEXT:    .long 20 @ 0x14
 ; CHECK-NEXT:    .long 23 @ 0x17
-; CHECK-NEXT:  .LCPI13_2:
+; CHECK-NEXT:  .LCPI15_2:
 ; CHECK-NEXT:    .long 24 @ 0x18
 ; CHECK-NEXT:    .long 27 @ 0x1b
 ; CHECK-NEXT:    .long 30 @ 0x1e
 ; CHECK-NEXT:    .long 33 @ 0x21
-; CHECK-NEXT:  .LCPI13_3:
+; CHECK-NEXT:  .LCPI15_3:
 ; CHECK-NEXT:    .long 1 @ 0x1
 ; CHECK-NEXT:    .long 4 @ 0x4
 ; CHECK-NEXT:    .long 7 @ 0x7
 ; CHECK-NEXT:    .long 10 @ 0xa
-; CHECK-NEXT:  .LCPI13_4:
+; CHECK-NEXT:  .LCPI15_4:
 ; CHECK-NEXT:    .long 36 @ 0x24
 ; CHECK-NEXT:    .long 39 @ 0x27
 ; CHECK-NEXT:    .long 42 @ 0x2a
 ; CHECK-NEXT:    .long 45 @ 0x2d
-; CHECK-NEXT:  .LCPI13_5:
+; CHECK-NEXT:  .LCPI15_5:
 ; CHECK-NEXT:    .long 25 @ 0x19
 ; CHECK-NEXT:    .long 28 @ 0x1c
 ; CHECK-NEXT:    .long 31 @ 0x1f
 ; CHECK-NEXT:    .long 34 @ 0x22
-; CHECK-NEXT:  .LCPI13_6:
+; CHECK-NEXT:  .LCPI15_6:
 ; CHECK-NEXT:    .long 13 @ 0xd
 ; CHECK-NEXT:    .long 16 @ 0x10
 ; CHECK-NEXT:    .long 19 @ 0x13
 ; CHECK-NEXT:    .long 22 @ 0x16
-; CHECK-NEXT:  .LCPI13_7:
+; CHECK-NEXT:  .LCPI15_7:
 ; CHECK-NEXT:    .long 2 @ 0x2
 ; CHECK-NEXT:    .long 5 @ 0x5
 ; CHECK-NEXT:    .long 8 @ 0x8
 ; CHECK-NEXT:    .long 11 @ 0xb
-; CHECK-NEXT:  .LCPI13_8:
+; CHECK-NEXT:  .LCPI15_8:
 ; CHECK-NEXT:    .long 26 @ 0x1a
 ; CHECK-NEXT:    .long 29 @ 0x1d
 ; CHECK-NEXT:    .long 32 @ 0x20
 ; CHECK-NEXT:    .long 35 @ 0x23
-; CHECK-NEXT:  .LCPI13_9:
+; CHECK-NEXT:  .LCPI15_9:
 ; CHECK-NEXT:    .long 37 @ 0x25
 ; CHECK-NEXT:    .long 40 @ 0x28
 ; CHECK-NEXT:    .long 43 @ 0x2b
 ; CHECK-NEXT:    .long 46 @ 0x2e
-; CHECK-NEXT:  .LCPI13_10:
+; CHECK-NEXT:  .LCPI15_10:
 ; CHECK-NEXT:    .long 12 @ 0xc
 ; CHECK-NEXT:    .long 15 @ 0xf
 ; CHECK-NEXT:    .long 18 @ 0x12
 ; CHECK-NEXT:    .long 21 @ 0x15
-; CHECK-NEXT:  .LCPI13_11:
+; CHECK-NEXT:  .LCPI15_11:
 ; CHECK-NEXT:    .long 0 @ 0x0
 ; CHECK-NEXT:    .long 3 @ 0x3
 ; CHECK-NEXT:    .long 6 @ 0x6
@@ -1238,14 +1293,14 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
 ; CHECK-NEXT:    sub sp, #64
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    strd r1, r2, [sp, #56] @ 8-byte Folded Spill
-; CHECK-NEXT:    blt.w .LBB14_5
+; CHECK-NEXT:    blt.w .LBB16_5
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT:    adr r5, .LCPI14_3
-; CHECK-NEXT:    adr r7, .LCPI14_1
+; CHECK-NEXT:    adr r5, .LCPI16_3
+; CHECK-NEXT:    adr r7, .LCPI16_1
 ; CHECK-NEXT:    vldrw.u32 q0, [r5]
 ; CHECK-NEXT:    ldr r1, [sp, #60] @ 4-byte Reload
-; CHECK-NEXT:    adr r3, .LCPI14_0
-; CHECK-NEXT:    adr r6, .LCPI14_2
+; CHECK-NEXT:    adr r3, .LCPI16_0
+; CHECK-NEXT:    adr r6, .LCPI16_2
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r7]
 ; CHECK-NEXT:    bic r9, r1, #7
@@ -1255,16 +1310,16 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
 ; CHECK-NEXT:    mov.w lr, #16
 ; CHECK-NEXT:    str.w r9, [sp, #52] @ 4-byte Spill
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:  .LBB14_2: @ %vector.ph
+; CHECK-NEXT:  .LBB16_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB14_3 Depth 2
+; CHECK-NEXT:    @ Child Loop BB16_3 Depth 2
 ; CHECK-NEXT:    ldr.w r8, [sp, #56] @ 4-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q5, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q7, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vmov q4, q3
-; CHECK-NEXT:  .LBB14_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB14_2 Depth=1
+; CHECK-NEXT:  .LBB16_3: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB16_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vadd.i32 q1, q5, r0
 ; CHECK-NEXT:    vadd.i32 q2, q4, r0
@@ -1318,36 +1373,36 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
 ; CHECK-NEXT:    vmov.8 q0[15], r12
 ; CHECK-NEXT:    vstrb.8 q0, [r8], #16
 ; CHECK-NEXT:    vmov q0, q6
-; CHECK-NEXT:    bne .LBB14_3
+; CHECK-NEXT:    bne .LBB16_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB14_2 Depth=1
+; CHECK-NEXT:    @ in Loop: Header=BB16_2 Depth=1
 ; CHECK-NEXT:    ldr r1, [sp, #60] @ 4-byte Reload
 ; CHECK-NEXT:    ldr.w r9, [sp, #52] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r9, r1
-; CHECK-NEXT:    bne .LBB14_2
-; CHECK-NEXT:  .LBB14_5: @ %for.cond.cleanup
+; CHECK-NEXT:    bne .LBB16_2
+; CHECK-NEXT:  .LBB16_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #64
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
-; CHECK-NEXT:  .LCPI14_0:
+; CHECK-NEXT:  .LCPI16_0:
 ; CHECK-NEXT:    .long 0 @ 0x0
 ; CHECK-NEXT:    .long 1 @ 0x1
 ; CHECK-NEXT:    .long 2 @ 0x2
 ; CHECK-NEXT:    .long 3 @ 0x3
-; CHECK-NEXT:  .LCPI14_1:
+; CHECK-NEXT:  .LCPI16_1:
 ; CHECK-NEXT:    .long 8 @ 0x8
 ; CHECK-NEXT:    .long 9 @ 0x9
 ; CHECK-NEXT:    .long 10 @ 0xa
 ; CHECK-NEXT:    .long 11 @ 0xb
-; CHECK-NEXT:  .LCPI14_2:
+; CHECK-NEXT:  .LCPI16_2:
 ; CHECK-NEXT:    .long 4 @ 0x4
 ; CHECK-NEXT:    .long 5 @ 0x5
 ; CHECK-NEXT:    .long 6 @ 0x6
 ; CHECK-NEXT:    .long 7 @ 0x7
-; CHECK-NEXT:  .LCPI14_3:
+; CHECK-NEXT:  .LCPI16_3:
 ; CHECK-NEXT:    .long 12 @ 0xc
 ; CHECK-NEXT:    .long 13 @ 0xd
 ; CHECK-NEXT:    .long 14 @ 0xe
@@ -1390,21 +1445,21 @@ define void @shl(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n) {
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    poplt {r7, pc}
-; CHECK-NEXT:  .LBB15_1: @ %vector.ph
-; CHECK-NEXT:    adr r3, .LCPI15_0
+; CHECK-NEXT:  .LBB17_1: @ %vector.ph
+; CHECK-NEXT:    adr r3, .LCPI17_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r1
 ; CHECK-NEXT:    dlstp.32 lr, r2
-; CHECK-NEXT:  .LBB15_2: @ %vector.body
+; CHECK-NEXT:  .LBB17_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [q0, #64]!
 ; CHECK-NEXT:    vstrw.32 q1, [r0], #16
-; CHECK-NEXT:    letp lr, .LBB15_2
+; CHECK-NEXT:    letp lr, .LBB17_2
 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.4:
-; CHECK-NEXT:  .LCPI15_0:
+; CHECK-NEXT:  .LCPI17_0:
 ; CHECK-NEXT:    .long 4294967232 @ 0xffffffc0
 ; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
 ; CHECK-NEXT:    .long 4294967264 @ 0xffffffe0
@@ -1444,12 +1499,12 @@ define void @shlor(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n)
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    cmp r2, #1
-; CHECK-NEXT:    blt .LBB16_3
+; CHECK-NEXT:    blt .LBB18_3
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
-; CHECK-NEXT:    adr.w lr, .LCPI16_0
-; CHECK-NEXT:    adr r4, .LCPI16_1
-; CHECK-NEXT:    adr r5, .LCPI16_2
-; CHECK-NEXT:    adr r6, .LCPI16_3
+; CHECK-NEXT:    adr.w lr, .LCPI18_0
+; CHECK-NEXT:    adr r4, .LCPI18_1
+; CHECK-NEXT:    adr r5, .LCPI18_2
+; CHECK-NEXT:    adr r6, .LCPI18_3
 ; CHECK-NEXT:    vldrw.u32 q0, [r6]
 ; CHECK-NEXT:    vldrw.u32 q1, [r5]
 ; CHECK-NEXT:    vldrw.u32 q2, [r4]
@@ -1459,7 +1514,7 @@ define void @shlor(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n)
 ; CHECK-NEXT:    vadd.i32 q2, q2, r1
 ; CHECK-NEXT:    vadd.i32 q3, q3, r1
 ; CHECK-NEXT:    dlstp.32 lr, r2
-; CHECK-NEXT:  .LBB16_2: @ %vector.body
+; CHECK-NEXT:  .LBB18_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q4, [q3, #128]!
 ; CHECK-NEXT:    vldrw.u32 q5, [q2, #128]!
@@ -1469,28 +1524,28 @@ define void @shlor(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n)
 ; CHECK-NEXT:    vadd.i32 q4, q4, q5
 ; CHECK-NEXT:    vadd.i32 q4, q4, q6
 ; CHECK-NEXT:    vstrw.32 q4, [r0], #16
-; CHECK-NEXT:    letp lr, .LBB16_2
-; CHECK-NEXT:  .LBB16_3: @ %for.cond.cleanup
+; CHECK-NEXT:    letp lr, .LBB18_2
+; CHECK-NEXT:  .LBB18_3: @ %for.cond.cleanup
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.4:
-; CHECK-NEXT:  .LCPI16_0:
+; CHECK-NEXT:  .LCPI18_0:
 ; CHECK-NEXT:    .long 4294967168 @ 0xffffff80
 ; CHECK-NEXT:    .long 4294967200 @ 0xffffffa0
 ; CHECK-NEXT:    .long 4294967232 @ 0xffffffc0
 ; CHECK-NEXT:    .long 4294967264 @ 0xffffffe0
-; CHECK-NEXT:  .LCPI16_1:
+; CHECK-NEXT:  .LCPI18_1:
 ; CHECK-NEXT:    .long 4294967176 @ 0xffffff88
 ; CHECK-NEXT:    .long 4294967208 @ 0xffffffa8
 ; CHECK-NEXT:    .long 4294967240 @ 0xffffffc8
 ; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
-; CHECK-NEXT:  .LCPI16_2:
+; CHECK-NEXT:  .LCPI18_2:
 ; CHECK-NEXT:    .long 4294967184 @ 0xffffff90
 ; CHECK-NEXT:    .long 4294967216 @ 0xffffffb0
 ; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
 ; CHECK-NEXT:    .long 4294967280 @ 0xfffffff0
-; CHECK-NEXT:  .LCPI16_3:
+; CHECK-NEXT:  .LCPI18_3:
 ; CHECK-NEXT:    .long 4294967192 @ 0xffffff98
 ; CHECK-NEXT:    .long 4294967224 @ 0xffffffb8
 ; CHECK-NEXT:    .long 4294967256 @ 0xffffffd8
diff --git a/llvm/test/CodeGen/X86/2011-06-03-x87chain.ll b/llvm/test/CodeGen/X86/2011-06-03-x87chain.ll
index 67fd59e..ed3dcad 100644
--- a/llvm/test/CodeGen/X86/2011-06-03-x87chain.ll
+++ b/llvm/test/CodeGen/X86/2011-06-03-x87chain.ll
@@ -30,9 +30,9 @@ entry:
   ret float %conv
 }
 
-define void @PR17495() {
+define void @PR17495(i1 %arg) {
 entry:
-  br i1 undef, label %while.end, label %while.body
+  br i1 %arg, label %while.end, label %while.body
 
 while.body:                                       ; preds = %while.body, %entry
   %x.1.copyload = load i24, ptr undef, align 1
diff --git a/llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll b/llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
index 22bf458..49de509 100644
--- a/llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
+++ b/llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
@@ -165,7 +165,7 @@ failure:                                          ; preds = %backedge
   unreachable
 }
 
-define void @test_04() {
+define void @test_04(i32 %arg) {
 ; CHECK-LABEL: test_04:
 ; CHECK:       ## %bb.0: ## %bb
 ; CHECK-NEXT:    ud2
@@ -175,7 +175,7 @@ bb:
 bb1:                                              ; preds = %bb10, %bb
   %tmp = phi i64 [ 1, %bb ], [ %tmp2, %bb10 ]
   %tmp2 = add nuw nsw i64 %tmp, 1
-  br i1 undef, label %bb21, label %bb7
+  br i1 poison, label %bb21, label %bb7
 
 bb7:                                              ; preds = %bb1
   %tmp8 = add nsw i64 %tmp, -1
@@ -187,7 +187,7 @@ bb10:                                             ; preds = %bb16
   br label %bb1
 
 bb11:                                             ; preds = %bb16, %bb7
-  switch i32 undef, label %bb19 [
+  switch i32 %arg, label %bb19 [
     i32 0, label %bb17
     i32 1, label %bb16
     i32 2, label %bb15
@@ -205,7 +205,7 @@ bb15:                                             ; preds = %bb11
   unreachable
 
 bb16:                                             ; preds = %bb11
-  br i1 undef, label %bb10, label %bb11
+  br i1 poison, label %bb10, label %bb11
 
 bb17:                                             ; preds = %bb11
   unreachable
diff --git a/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll b/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll
index 86874b1..faa119c 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll
@@ -5,13 +5,13 @@ define void @undef_2phi(ptr%buf) {
 ; CHECK-LABEL: @undef_2phi(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
-; CHECK-NEXT:    br i1 undef, label [[L1:%.*]], label [[L2:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
 ; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
-; CHECK-NEXT:    br i1 undef, label [[L2]], label [[L3:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L2]], label [[L3:%.*]]
 ; CHECK:       l2:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi x86_amx [ [[TMP0]], [[ENTRY:%.*]] ], [ [[T1]], [[L1]] ]
-; CHECK-NEXT:    br i1 undef, label [[L3]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L3]], label [[EXIT:%.*]]
 ; CHECK:       l3:
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi x86_amx [ [[TMP1]], [[L2]] ], [ [[T1]], [[L1]] ]
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[BUF:%.*]], i64 1024, x86_amx [[TMP2]])
@@ -20,16 +20,16 @@ define void @undef_2phi(ptr%buf) {
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  br i1 undef, label %l1, label %l2
+  br i1 poison, label %l1, label %l2
 
 l1:
   %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
   %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
-  br i1 undef, label %l2, label %l3
+  br i1 poison, label %l2, label %l3
 
 l2:
-  %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ]
-  br i1 undef, label %l3, label %exit
+  %t3 = phi <256 x i32> [ poison, %entry ], [ %t2, %l1 ]
+  br i1 poison, label %l3, label %exit
 
 l3:
   %t4 = phi <256 x i32> [ %t3, %l2], [ %t2, %l1 ]
@@ -45,10 +45,10 @@ define void @foo_undef(ptr%buf) {
 ; CHECK-LABEL: @foo_undef(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
-; CHECK-NEXT:    br i1 undef, label [[L1:%.*]], label [[L2:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
 ; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
-; CHECK-NEXT:    br i1 undef, label [[L2]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L2]], label [[EXIT:%.*]]
 ; CHECK:       l2:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi x86_amx [ [[TMP0]], [[ENTRY:%.*]] ], [ [[T1]], [[L1]] ]
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[BUF:%.*]], i64 1024, x86_amx [[TMP1]])
@@ -57,15 +57,15 @@ define void @foo_undef(ptr%buf) {
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  br i1 undef, label %l1, label %l2
+  br i1 poison, label %l1, label %l2
 
 l1:
   %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
   %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
-  br i1 undef, label %l2, label %exit
+  br i1 poison, label %l2, label %exit
 
 l2:
-  %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ]
+  %t3 = phi <256 x i32> [ poison, %entry ], [ %t2, %l1 ]
   %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3)
   call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr %buf, i64 1024, x86_amx %t4)
   br label %exit
@@ -78,10 +78,10 @@ define void @foo_zero(ptr%buf) {
 ; CHECK-LABEL: @foo_zero(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
-; CHECK-NEXT:    br i1 undef, label [[L1:%.*]], label [[L2:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
 ; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
-; CHECK-NEXT:    br i1 undef, label [[L2]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L2]], label [[EXIT:%.*]]
 ; CHECK:       l2:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi x86_amx [ [[TMP0]], [[ENTRY:%.*]] ], [ [[T1]], [[L1]] ]
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[BUF:%.*]], i64 1024, x86_amx [[TMP1]])
@@ -90,12 +90,12 @@ define void @foo_zero(ptr%buf) {
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  br i1 undef, label %l1, label %l2
+  br i1 poison, label %l1, label %l2
 
 l1:
   %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
   %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
-  br i1 undef, label %l2, label %exit
+  br i1 poison, label %l2, label %exit
 
 l2:
   %t3 = phi <256 x i32> [ zeroinitializer, %entry ], [ %t2, %l1 ]
@@ -112,14 +112,14 @@ define void @foo_vrow(ptr%buf, i16 %row) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
 ; CHECK-NEXT:    [[TMP1:%.*]] = alloca <256 x i32>, align 64
-; CHECK-NEXT:    br i1 undef, label [[L1:%.*]], label [[L2:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
 ; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW:%.*]], i16 32)
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 32, ptr [[TMP1]], i64 32, x86_amx [[T1]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <256 x i32>, ptr [[TMP1]], align 1024
-; CHECK-NEXT:    br i1 undef, label [[L2]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L2]], label [[EXIT:%.*]]
 ; CHECK:       l2:
-; CHECK-NEXT:    [[T3:%.*]] = phi <256 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP3]], [[L1]] ]
+; CHECK-NEXT:    [[T3:%.*]] = phi <256 x i32> [ poison, [[ENTRY:%.*]] ], [ [[TMP3]], [[L1]] ]
 ; CHECK-NEXT:    store <256 x i32> [[T3]], ptr [[TMP0]], align 1024
 ; CHECK-NEXT:    [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 32, ptr [[TMP0]], i64 32)
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 32, ptr [[BUF:%.*]], i64 1024, x86_amx [[TMP5]])
@@ -128,15 +128,15 @@ define void @foo_vrow(ptr%buf, i16 %row) {
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  br i1 undef, label %l1, label %l2
+  br i1 poison, label %l1, label %l2
 
 l1:
   %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 32)
   %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
-  br i1 undef, label %l2, label %exit
+  br i1 poison, label %l2, label %exit
 
 l2:
-  %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ]
+  %t3 = phi <256 x i32> [ poison, %entry ], [ %t2, %l1 ]
   %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3)
   call void @llvm.x86.tilestored64.internal(i16 %row, i16 32, ptr %buf, i64 1024, x86_amx %t4)
   br label %exit
@@ -150,13 +150,13 @@ define void @foo_vcol(ptr%buf, i16 %col) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
 ; CHECK-NEXT:    [[TMP1:%.*]] = alloca <256 x i32>, align 64
-; CHECK-NEXT:    br i1 undef, label [[L1:%.*]], label [[L2:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
 ; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 [[COL:%.*]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[COL]] to i64
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 8, i16 [[COL]], ptr [[TMP1]], i64 [[TMP3]], x86_amx [[T1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <256 x i32>, ptr [[TMP1]], align 1024
-; CHECK-NEXT:    br i1 undef, label [[L2]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L2]], label [[EXIT:%.*]]
 ; CHECK:       l2:
 ; CHECK-NEXT:    [[T3:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP4]], [[L1]] ]
 ; CHECK-NEXT:    store <256 x i32> [[T3]], ptr [[TMP0]], align 1024
@@ -168,12 +168,12 @@ define void @foo_vcol(ptr%buf, i16 %col) {
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  br i1 undef, label %l1, label %l2
+  br i1 poison, label %l1, label %l2
 
 l1:
   %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 %col)
   %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
-  br i1 undef, label %l2, label %exit
+  br i1 poison, label %l2, label %exit
 
 l2:
   %t3 = phi <256 x i32> [ zeroinitializer, %entry ], [ %t2, %l1 ]
@@ -189,29 +189,29 @@ define void @noshape(ptr%buf) {
 ; CHECK-LABEL: @noshape(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
-; CHECK-NEXT:    br i1 undef, label [[L1:%.*]], label [[L2:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
 ; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, x86_amx [[T1]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024
-; CHECK-NEXT:    br i1 undef, label [[L2]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L2]], label [[EXIT:%.*]]
 ; CHECK:       l2:
-; CHECK-NEXT:    [[T3:%.*]] = phi <256 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP2]], [[L1]] ]
+; CHECK-NEXT:    [[T3:%.*]] = phi <256 x i32> [ poison, [[ENTRY:%.*]] ], [ [[TMP2]], [[L1]] ]
 ; CHECK-NEXT:    store <256 x i32> [[T3]], ptr [[BUF:%.*]], align 1024
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  br i1 undef, label %l1, label %l2
+  br i1 poison, label %l1, label %l2
 
 l1:
   %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
   %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
-  br i1 undef, label %l2, label %exit
+  br i1 poison, label %l2, label %exit
 
 l2:
-  %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ]
+  %t3 = phi <256 x i32> [ poison, %entry ], [ %t2, %l1 ]
   %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3)
   %t5 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t4)
   store <256 x i32> %t5, ptr %buf
@@ -225,14 +225,14 @@ define void @noshape2(ptr%buf) {
 ; CHECK-LABEL: @noshape2(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
-; CHECK-NEXT:    br i1 undef, label [[L1:%.*]], label [[L2:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
 ; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, x86_amx [[T1]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024
-; CHECK-NEXT:    br i1 undef, label [[L2]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L2]], label [[EXIT:%.*]]
 ; CHECK:       l2:
-; CHECK-NEXT:    [[T3:%.*]] = phi <256 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP2]], [[L1]] ]
+; CHECK-NEXT:    [[T3:%.*]] = phi <256 x i32> [ poison, [[ENTRY:%.*]] ], [ [[TMP2]], [[L1]] ]
 ; CHECK-NEXT:    [[T6:%.*]] = call <256 x i32> @llvm.abs.v256i32(<256 x i32> [[T3]], i1 true)
 ; CHECK-NEXT:    store <256 x i32> [[T6]], ptr [[BUF:%.*]], align 1024
 ; CHECK-NEXT:    br label [[EXIT]]
@@ -240,15 +240,15 @@ define void @noshape2(ptr%buf) {
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  br i1 undef, label %l1, label %l2
+  br i1 poison, label %l1, label %l2
 
 l1:
   %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
   %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
-  br i1 undef, label %l2, label %exit
+  br i1 poison, label %l2, label %exit
 
 l2:
-  %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ]
+  %t3 = phi <256 x i32> [ poison, %entry ], [ %t2, %l1 ]
   %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3)
   %t5 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t4)
   %t6 = call <256 x i32> @llvm.abs.v256i32(<256 x i32> %t5, i1 1)
diff --git a/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll
index b2eb5fd..b70668f 100644
--- a/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll
+++ b/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll
@@ -18,14 +18,14 @@ wrapper_entry:
 
 ; Cases where amxcast can be combined across bb
 ; %5 and %6 is combined together since %goodphi's incoming is phi or amxcast
-define void @combine_amx_cast_and_phi() {
+define void @combine_amx_cast_and_phi(i1 %arg) {
 ; CHECK-LABEL: @combine_amx_cast_and_phi(
 ; CHECK-NEXT:  wrapper_entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <560 x i8>, align 64
 ; CHECK-NEXT:    [[TMP1:%.*]] = alloca <616 x i8>, align 64
 ; CHECK-NEXT:    [[TMP2:%.*]] = alloca <110 x i32>, align 64
 ; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
-; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
 ; CHECK:       for.body.i.lr.ph.i:
 ; CHECK-NEXT:    store <110 x i32> undef, ptr [[TMP2]], align 512
 ; CHECK-NEXT:    [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40)
@@ -43,7 +43,7 @@ define void @combine_amx_cast_and_phi() {
 wrapper_entry:
   %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
   %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0)
-  br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
+  br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
 
 for.body.i.lr.ph.i:                               ; preds = %wrapper_entry
   %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef)
@@ -62,7 +62,7 @@ for.cond.cleanup.i.i:                             ; preds = %for.body.i.lr.ph.i,
 
 ; Cases where amxcast can't be combined across bb
 ; %5 and %6 is not combined together since %evilphi's incoming is not phi or amxcast
-define void @fail_to_combine_amx_cast_and_phi(<110 x i32> %tmp) {
+define void @fail_to_combine_amx_cast_and_phi(<110 x i32> %tmp, i1 %arg) {
 ; CHECK-LABEL: @fail_to_combine_amx_cast_and_phi(
 ; CHECK-NEXT:  wrapper_entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <110 x i32>, align 64
@@ -71,7 +71,7 @@ define void @fail_to_combine_amx_cast_and_phi(<110 x i32> %tmp) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = alloca <616 x i8>, align 64
 ; CHECK-NEXT:    [[TMP4:%.*]] = alloca <110 x i32>, align 64
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <110 x i32> [[TMP:%.*]], [[TMP]]
-; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
 ; CHECK:       for.body.i.lr.ph.i:
 ; CHECK-NEXT:    store <110 x i32> undef, ptr [[TMP4]], align 512
 ; CHECK-NEXT:    [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP4]], i64 40)
@@ -92,7 +92,7 @@ define void @fail_to_combine_amx_cast_and_phi(<110 x i32> %tmp) {
 ;
 wrapper_entry:
   %0 = add <110 x i32> %tmp, %tmp
-  br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
+  br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
 
 for.body.i.lr.ph.i:                               ; preds = %wrapper_entry
   %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef)
@@ -111,7 +111,7 @@ for.cond.cleanup.i.i:                             ; preds = %for.body.i.lr.ph.i,
 
 ; Cases where amxcast can't be combined across bb
 ; %5 and %6 is not combined together since %evilphi's user aka %evilphi2 is not inside phi web.
-define void @fail_to_combine_amx_cast_and_phi2() {
+define void @fail_to_combine_amx_cast_and_phi2(i1 %arg) {
 ; CHECK-LABEL: @fail_to_combine_amx_cast_and_phi2(
 ; CHECK-NEXT:  wrapper_entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <110 x i32>, align 64
@@ -123,7 +123,7 @@ define void @fail_to_combine_amx_cast_and_phi2() {
 ; CHECK-NEXT:    [[TMP6:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP5]], i64 40, x86_amx [[TMP6]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <110 x i32>, ptr [[TMP5]], align 512
-; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
 ; CHECK:       for.body.i.lr.ph.i:
 ; CHECK-NEXT:    store <110 x i32> undef, ptr [[TMP4]], align 512
 ; CHECK-NEXT:    [[TMP10:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP4]], i64 40)
@@ -134,13 +134,13 @@ define void @fail_to_combine_amx_cast_and_phi2() {
 ; CHECK-NEXT:    [[TMP15:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP10]], x86_amx [[TMP12]], x86_amx [[TMP14]])
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP1]], i64 40, x86_amx [[TMP15]])
 ; CHECK-NEXT:    [[TMP17:%.*]] = load <110 x i32>, ptr [[TMP1]], align 512
-; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]]
 ; CHECK:       for.cond.cleanup.i.i:
 ; CHECK-NEXT:    [[GOODPHI:%.*]] = phi <110 x i32> [ [[TMP8]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP17]], [[FOR_BODY_I_LR_PH_I]] ]
 ; CHECK-NEXT:    store <110 x i32> [[GOODPHI]], ptr [[TMP0]], align 512
 ; CHECK-NEXT:    [[TMP19:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP0]], i64 40)
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP19]])
-; CHECK-NEXT:    br i1 undef, label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[EVILPHI2:%.*]] = phi <110 x i32> [ [[GOODPHI]], [[FOR_COND_CLEANUP_I_I]] ], [ [[TMP17]], [[FOR_BODY_I_LR_PH_I]] ]
 ; CHECK-NEXT:    store <110 x i32> [[EVILPHI2]], ptr undef, align 512
@@ -149,7 +149,7 @@ define void @fail_to_combine_amx_cast_and_phi2() {
 wrapper_entry:
   %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
   %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0)
-  br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
+  br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
 
 for.body.i.lr.ph.i:                               ; preds = %wrapper_entry
   %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef)
@@ -157,27 +157,27 @@ for.body.i.lr.ph.i:                               ; preds = %wrapper_entry
   %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef)
   %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3)
   %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4)
-  br i1 undef, label %for.cond.cleanup.i.i, label %exit
+  br i1 %arg, label %for.cond.cleanup.i.i, label %exit
 
 for.cond.cleanup.i.i:                             ; preds = %for.body.i.lr.ph.i, %wrapper_entry
   %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ]
   %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi)
   call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %6)
-  br i1 undef, label %exit, label %for.body.i.lr.ph.i
+  br i1 %arg, label %exit, label %for.body.i.lr.ph.i
 exit:
   %evilphi2 = phi <110 x i32> [ %goodphi, %for.cond.cleanup.i.i ], [ %5, %for.body.i.lr.ph.i ]
   store <110 x i32> %evilphi2, ptr undef, align 512
   ret void
 }
 
-define void @fail_to_combine_amx_cast_and_phi_due_to_const_value() {
+define void @fail_to_combine_amx_cast_and_phi_due_to_const_value(i1 %arg) {
 ; CHECK-LABEL: @fail_to_combine_amx_cast_and_phi_due_to_const_value(
 ; CHECK-NEXT:  wrapper_entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <560 x i8>, align 64
 ; CHECK-NEXT:    [[TMP1:%.*]] = alloca <616 x i8>, align 64
 ; CHECK-NEXT:    [[TMP2:%.*]] = alloca <110 x i32>, align 64
 ; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 11, i16 40)
-; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
 ; CHECK:       for.body.i.lr.ph.i:
 ; CHECK-NEXT:    store <110 x i32> undef, ptr [[TMP2]], align 512
 ; CHECK-NEXT:    [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40)
@@ -193,7 +193,7 @@ define void @fail_to_combine_amx_cast_and_phi_due_to_const_value() {
 ; CHECK-NEXT:    ret void
 ;
 wrapper_entry:
-  br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
+  br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
 
 for.body.i.lr.ph.i:                               ; preds = %wrapper_entry
   %0 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef)
@@ -213,14 +213,14 @@ for.cond.cleanup.i.i:                             ; preds = %for.body.i.lr.ph.i,
 ; Cases where amxcast can be combined across bb
 ; When optimizeAMXCastFromPhi process %6 and %goodphi, %goodphi2 is outside the phi-web, so the optimization stop
 ; When optimizeAMXCastFromPhi process %7 and %goodphi2, the optimization continue.
-define void @combine_amx_cast_and_multiple_phi() {
+define void @combine_amx_cast_and_multiple_phi(i1 %arg) {
 ; CHECK-LABEL: @combine_amx_cast_and_multiple_phi(
 ; CHECK-NEXT:  wrapper_entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <560 x i8>, align 64
 ; CHECK-NEXT:    [[TMP1:%.*]] = alloca <616 x i8>, align 64
 ; CHECK-NEXT:    [[TMP2:%.*]] = alloca <110 x i32>, align 64
 ; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
-; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
 ; CHECK:       for.body.i.lr.ph.i:
 ; CHECK-NEXT:    store <110 x i32> undef, ptr [[TMP2]], align 512
 ; CHECK-NEXT:    [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40)
@@ -229,11 +229,11 @@ define void @combine_amx_cast_and_multiple_phi() {
 ; CHECK-NEXT:    store <560 x i8> undef, ptr [[TMP0]], align 1024
 ; CHECK-NEXT:    [[TMP9:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP0]], i64 40)
 ; CHECK-NEXT:    [[TMP10:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP5]], x86_amx [[TMP7]], x86_amx [[TMP9]])
-; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]]
 ; CHECK:       for.cond.cleanup.i.i:
 ; CHECK-NEXT:    [[TMP11:%.*]] = phi x86_amx [ [[TMP3]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ]
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP11]])
-; CHECK-NEXT:    br i1 undef, label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[TMP12:%.*]] = phi x86_amx [ [[TMP11]], [[FOR_COND_CLEANUP_I_I]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ]
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP12]])
@@ -242,7 +242,7 @@ define void @combine_amx_cast_and_multiple_phi() {
 wrapper_entry:
   %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
   %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0)
-  br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
+  br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
 
 for.body.i.lr.ph.i:                               ; preds = %wrapper_entry
   %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef)
@@ -250,13 +250,13 @@ for.body.i.lr.ph.i:                               ; preds = %wrapper_entry
   %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef)
   %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3)
   %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4)
-  br i1 undef, label %for.cond.cleanup.i.i, label %exit
+  br i1 %arg, label %for.cond.cleanup.i.i, label %exit
 
 for.cond.cleanup.i.i:                             ; preds = %for.body.i.lr.ph.i, %wrapper_entry
   %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ]
   %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi)
   call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %6)
-  br i1 undef, label %exit, label %for.body.i.lr.ph.i
+  br i1 %arg, label %exit, label %for.body.i.lr.ph.i
 exit:
   %evilphi2 = phi <110 x i32> [ %goodphi, %for.cond.cleanup.i.i ], [ %5, %for.body.i.lr.ph.i ]
   %7 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2)
@@ -265,7 +265,7 @@ exit:
 }
 
 ; Currently we are not able to delete DeadPHICycle, later we will handle with them
-define void @combine_amx_cast_and_phi_in_a_circle() {
+define void @combine_amx_cast_and_phi_in_a_circle(i1 %arg) {
 ; CHECK-LABEL: @combine_amx_cast_and_phi_in_a_circle(
 ; CHECK-NEXT:  wrapper_entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <110 x i32>, align 64
@@ -284,7 +284,7 @@ define void @combine_amx_cast_and_phi_in_a_circle() {
 ; CHECK-NEXT:    [[TMP11:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP6]], x86_amx [[TMP8]], x86_amx [[TMP10]])
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP0]], i64 40, x86_amx [[TMP11]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = load <110 x i32>, ptr [[TMP0]], align 512
-; CHECK-NEXT:    br i1 undef, label [[BB2:%.*]], label [[BB3:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[BB2:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb2:
 ; CHECK-NEXT:    [[TMP14:%.*]] = phi x86_amx [ [[TMP15:%.*]], [[BB3]] ], [ [[TMP11]], [[BB1]] ]
 ; CHECK-NEXT:    [[GOODPHI:%.*]] = phi <110 x i32> [ [[EVILPHI2:%.*]], [[BB3]] ], [ [[TMP13]], [[BB1]] ]
@@ -294,7 +294,7 @@ define void @combine_amx_cast_and_phi_in_a_circle() {
 ; CHECK-NEXT:    [[TMP15]] = phi x86_amx [ [[TMP14]], [[BB2]] ], [ [[TMP11]], [[BB1]] ]
 ; CHECK-NEXT:    [[EVILPHI2]] = phi <110 x i32> [ [[GOODPHI]], [[BB2]] ], [ [[TMP13]], [[BB1]] ]
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP15]])
-; CHECK-NEXT:    br i1 undef, label [[BB2]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[BB2]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP15]])
 ; CHECK-NEXT:    ret void
@@ -310,7 +310,7 @@ bb1:                               ; preds = %wrapper_entry
   %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef)
   %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3)
   %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4)
-  br i1 undef, label %bb2, label %bb3
+  br i1 %arg, label %bb2, label %bb3
 
 bb2:                             ; preds = %bb1, %wrapper_entry
   %goodphi = phi <110 x i32> [ %evilphi2, %bb3], [ %5, %bb1 ]
@@ -321,19 +321,19 @@ bb3:
   %evilphi2 = phi <110 x i32> [ %goodphi, %bb2 ], [ %5, %bb1 ]
   %7 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2)
   call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %7)
-  br i1 undef, label %bb2, label %exit
+  br i1 %arg, label %bb2, label %exit
 exit:
   %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2)
   call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %8)
   ret void
 }
 
-define void @eliminate_unused_phi_and_cast() {
+define void @eliminate_unused_phi_and_cast(i1 %arg) {
 ; CHECK-LABEL: @eliminate_unused_phi_and_cast(
 ; CHECK-NEXT:  wrapper_entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <560 x i8>, align 64
 ; CHECK-NEXT:    [[TMP1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
-; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
 ; CHECK:       for.body.i.lr.ph.i:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr undef, i64 undef)
 ; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr undef, i64 undef)
@@ -349,7 +349,7 @@ define void @eliminate_unused_phi_and_cast() {
 wrapper_entry:
   %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
   %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0)
-  br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
+  br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
 
 for.body.i.lr.ph.i:                               ; preds = %wrapper_entry
   %1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr undef, i64 undef)
diff --git a/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll
index 391727d54..3a5b4245 100644
--- a/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll
+++ b/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll
@@ -317,16 +317,16 @@ define dso_local void @__tile_stored(ptr %0, i64 %1, ptr nocapture readonly byva
   ret void
 }
 
-define void @dead_code(ptr%buf) {
+define void @dead_code(ptr%buf, i1 %arg) {
 ; CHECK-LABEL: @dead_code(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
-; CHECK-NEXT:    br i1 undef, label [[L1:%.*]], label [[L2:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
 ; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, x86_amx [[T1]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024
-; CHECK-NEXT:    br i1 undef, label [[L2]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[L2]], label [[EXIT:%.*]]
 ; CHECK:       l2:
 ; CHECK-NEXT:    [[T3:%.*]] = phi <256 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP1]], [[L1]] ]
 ; CHECK-NEXT:    store <256 x i32> [[T3]], ptr [[BUF:%.*]], align 1024
@@ -335,12 +335,12 @@ define void @dead_code(ptr%buf) {
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  br i1 undef, label %l1, label %l2
+  br i1 %arg, label %l1, label %l2
 
 l1:
   %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
   %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
-  br i1 undef, label %l2, label %exit
+  br i1 %arg, label %l2, label %exit
 
 l2:
   %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ]
diff --git a/llvm/test/CodeGen/X86/StackColoring.ll b/llvm/test/CodeGen/X86/StackColoring.ll
index 389d024..db3e7dc 100644
--- a/llvm/test/CodeGen/X86/StackColoring.ll
+++ b/llvm/test/CodeGen/X86/StackColoring.ll
@@ -135,7 +135,7 @@ entry:
   %t3 = call i32 @foo(i32 %in, ptr %a3)
   %t4 = call i32 @foo(i32 %in, ptr %a3)
   call void @llvm.lifetime.end.p0(i64 -1, ptr %a3)
-  br i1 undef, label %bb2, label %bb3
+  br i1 poison, label %bb2, label %bb3
 bb2:
   call void @llvm.lifetime.start.p0(i64 -1, ptr %a4)
   %t11 = call i32 @foo(i32 %in, ptr %a4)
diff --git a/llvm/test/CodeGen/X86/asm-label.ll b/llvm/test/CodeGen/X86/asm-label.ll
index 05c37db..2d3e7b6 100644
--- a/llvm/test/CodeGen/X86/asm-label.ll
+++ b/llvm/test/CodeGen/X86/asm-label.ll
@@ -12,15 +12,15 @@
 ; SAVETEMP:         jne {{.*}} <.LBB0_1>
 ; SAVETEMP-LABEL: <.LBB0_1>:
 
-define void @foo()  {
+define void @foo(i1 %arg, i32 %arg2)  {
 entry:
-  br i1 undef, label %land.lhs.true, label %if.end11
+  br i1 %arg, label %land.lhs.true, label %if.end11
 
 land.lhs.true:                                    ; preds = %entry
-  br i1 undef, label %if.then, label %if.end11
+  br i1 %arg, label %if.then, label %if.end11
 
 if.then:                                          ; preds = %land.lhs.true
-  br i1 undef, label %if.then9, label %if.end
+  br i1 %arg, label %if.then9, label %if.end
 
 if.then9:                                         ; preds = %if.then
   br label %cleanup
@@ -29,7 +29,7 @@ if.end:                                           ; preds = %if.then
   br label %cleanup
 
 cleanup:                                          ; preds = %if.end, %if.then9
-  switch i32 undef, label %default [
+  switch i32 %arg2, label %default [
     i32 0, label %cleanup.cont
     i32 1, label %if.end11
   ]
diff --git a/llvm/test/CodeGen/X86/avx-select.ll b/llvm/test/CodeGen/X86/avx-select.ll
index 7a33daf..1b688c8 100644
--- a/llvm/test/CodeGen/X86/avx-select.ll
+++ b/llvm/test/CodeGen/X86/avx-select.ll
@@ -84,7 +84,7 @@ head:
   %isneg = icmp slt <4 x i32> %v3, zeroinitializer
   %or0 = select <4 x i1> %isneg, <4 x i32> <i32 26146, i32 -1257, i32 -2, i32 -3052>, <4 x i32> <i32 -24947, i32 7802, i32 29242, i32 15858>
   %or1 = shufflevector <4 x i32> %or0, <4 x i32> <i32 29361, i32 -16094, i32 -3080, i32 -26286>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  br i1 undef, label %exit, label %head
+  br i1 poison, label %exit, label %head
 
 exit:
   store <8 x i32> %or1, ptr addrspace(1) undef, align 32
diff --git a/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll b/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll
new file mode 100644
index 0000000..d7ad7b0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-linux -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X64
+
+; VCVTTPD2DQS
+define <8 x i32> @test_signed_v8i32_v8f64(<8 x double> %f) nounwind {
+; CHECK-LABEL: test_signed_v8i32_v8f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2dqs %zmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> %f)
+  ret <8 x i32> %x
+}
+
+; VCVTTPD2QQS
+define <8 x i64> @test_signed_v8i64_v8f64(<8 x double> %f) nounwind {
+; CHECK-LABEL: test_signed_v8i64_v8f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2qqs %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> %f)
+  ret <8 x i64> %x
+}
+
+; VCVTTPD2UDQS
+define <8 x i32> @test_unsigned_v8i32_v8f64(<8 x double> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v8i32_v8f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2udqs %zmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> %f)
+ ret <8 x i32> %x
+}
+
+; VCVTTPD2UQQS
+define <8 x i64> @test_unsigned_v8i64_v8f64(<8 x double> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v8i64_v8f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2uqqs %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> %f)
+  ret <8 x i64> %x
+}
+
+; VCVTTPS2DQS
+define <16 x i32> @test_signed_v16i32_v16f32(<16 x float> %f) nounwind {
+; CHECK-LABEL: test_signed_v16i32_v16f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2dqs %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <16 x i32> @llvm.fptosi.sat.v16i32.v16f32(<16 x float> %f)
+  ret <16 x i32> %x
+}
+
+; VCVTTPS2UDQS
+define <16 x i32> @test_unsigned_v16i32_v16f32(<16 x float> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v16i32_v16f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2udqs %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <16 x i32> @llvm.fptoui.sat.v16i32.v16f32(<16 x float> %f)
+  ret <16 x i32> %x
+}
+; VCVTTPS2QQS
+define <8 x i64> @test_signed_v8i64_v8f32(<8 x float> %f) nounwind {
+; CHECK-LABEL: test_signed_v8i64_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2qqs %ymm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> %f)
+  ret <8 x i64> %x
+}
+
+; VCVTTPS2UQQS
+define <8 x i64> @test_unsigned_v8i64_v8f32(<8 x float> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v8i64_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2uqqs %ymm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> %f)
+  ret <8 x i64> %x
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X64: {{.*}}
+; X86: {{.*}}
diff --git a/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll b/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll
index 494e4bc..a2f167e 100644
--- a/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll
+++ b/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-linux -mattr=+avx10.2-256 | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx10.2-256 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-linux -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X64
 
 ;
 ; 32-bit float to signed integer
@@ -112,3 +112,157 @@ define i64 @test_signed_i64_f64(double %f) nounwind {
     %x = call i64 @llvm.fptosi.sat.i64.f64(double %f)
     ret i64 %x
 }
+
+; VCVTTPD2DQS
+define <2 x i32> @test_signed_v2i32_v2f64(<2 x double> %d) nounwind {
+; CHECK-LABEL: test_signed_v2i32_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2dqs %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> %d)
+  ret <2 x i32> %x
+}
+
+define <4 x i32> @test_signed_v4i32_v4f64(<4 x double> %f) nounwind {
+; CHECK-LABEL: test_signed_v4i32_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2dqs %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> %f)
+  ret <4 x i32> %x
+}
+
+; VCVTTPD2QQS
+define <2 x i64> @test_signed_v2i64_v2f64(<2 x double> %f) nounwind {
+; CHECK-LABEL: test_signed_v2i64_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2qqs %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+    %x = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> %f)
+    ret <2 x i64> %x
+}
+
+define <4 x i64> @test_signed_v4i64_v4f64(<4 x double> %f) nounwind {
+; CHECK-LABEL: test_signed_v4i64_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2qqs %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> %f)
+  ret <4 x i64> %x
+}
+
+; VCVTTPD2UDQS
+define <2 x i32> @test_unsigned_v2i32_v2f64(<2 x double> %d) nounwind {
+; CHECK-LABEL: test_unsigned_v2i32_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2udqs %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> %d)
+  ret <2 x i32> %x
+}
+
+define <4 x i32> @test_unsigned_v4i32_v4f64(<4 x double> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v4i32_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2udqs %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> %f)
+  ret <4 x i32> %x
+}
+
+; VCVTTPD2UQQS
+define <2 x i64> @test_unsigned_v2i64_v2f64(<2 x double> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v2i64_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2uqqs %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> %f)
+  ret <2 x i64> %x
+}
+
+define <4 x i64> @test_unsigned_v4i64_v4f64(<4 x double> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v4i64_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2uqqs %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <4 x i64> @llvm.fptoui.sat.v4i64.v4f64(<4 x double> %f)
+  ret <4 x i64> %x
+}
+
+; VCVTTPS2DQS
+define <4 x i32> @test_signed_v4i32_v4f32(<4 x float> %f) nounwind {
+; CHECK-LABEL: test_signed_v4i32_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2dqs %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> %f)
+  ret <4 x i32> %x
+}
+
+define <8 x i32> @test_signed_v8i32_v8f32(<8 x float> %f) nounwind {
+; CHECK-LABEL: test_signed_v8i32_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2dqs %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> %f)
+  ret <8 x i32> %x
+}
+
+; VCVTTPS2UDQS
+define <4 x i32> @test_unsigned_v4i32_v4f32(<4 x float> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v4i32_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2udqs %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> %f)
+  ret <4 x i32> %x
+}
+
+define <8 x i32> @test_unsigned_v8i32_v8f32(<8 x float> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v8i32_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2udqs %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> %f)
+  ret <8 x i32> %x
+}
+
+; VCVTTPS2QQS
+define <2 x i64> @test_signed_v2i64_v2f32(<2 x float> %f) nounwind {
+; CHECK-LABEL: test_signed_v2i64_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2qqs %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> %f)
+  ret <2 x i64> %x
+}
+
+define <4 x i64> @test_signed_v4i64_v4f32(<4 x float> %f) nounwind {
+; CHECK-LABEL: test_signed_v4i64_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2qqs %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> %f)
+  ret <4 x i64> %x
+}
+
+; VCVTTPS2UQQS
+define <2 x i64> @test_unsigned_v2i64_v2f32(<2 x float> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v2i64_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2uqqs %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> %f)
+  ret <2 x i64> %x
+}
+
+define <4 x i64> @test_unsigned_v4i64_v4f32(<4 x float> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v4i64_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2uqqs %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> %f)
+  ret <4 x i64> %x
+}
diff --git a/llvm/test/CodeGen/X86/avx512-i1test.ll b/llvm/test/CodeGen/X86/avx512-i1test.ll
index 3cd7331..d8683df 100644
--- a/llvm/test/CodeGen/X86/avx512-i1test.ll
+++ b/llvm/test/CodeGen/X86/avx512-i1test.ll
@@ -21,20 +21,20 @@ define void @func() {
 ; CHECK-NEXT:    testb %al, %al
 ; CHECK-NEXT:    jmp .LBB0_2
 bb1:
-  br i1 undef, label %L_10, label %L_10
+  br i1 poison, label %L_10, label %L_10
 
 L_10:                                             ; preds = %bb1, %bb1
-  br i1 undef, label %L_30, label %bb56
+  br i1 poison, label %L_30, label %bb56
 
 bb56:                                             ; preds = %L_10
   br label %bb33
 
 bb33:                                             ; preds = %bb51, %bb56
   %r111 = load i64, ptr undef, align 8
-  br i1 undef, label %bb51, label %bb35
+  br i1 poison, label %bb51, label %bb35
 
 bb35:                                             ; preds = %bb33
-  br i1 undef, label %L_19, label %bb37
+  br i1 poison, label %L_19, label %bb37
 
 bb37:                                             ; preds = %bb35
   %r128 = and i64 %r111, 576460752303423488
@@ -43,7 +43,7 @@ bb37:                                             ; preds = %bb35
 
 L_19:                                             ; preds = %bb37, %bb35
   %"$V_S25.0" = phi i1 [ %phitmp, %bb37 ], [ true, %bb35 ]
-  br i1 undef, label %bb51, label %bb42
+  br i1 poison, label %bb51, label %bb42
 
 bb42:                                             ; preds = %L_19
   %r136 = select i1 %"$V_S25.0", ptr undef, ptr undef
diff --git a/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll
index 55b86ca..9db57fe 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll
@@ -5,6 +5,10 @@ declare half @llvm.minimum.f16(half, half)
 declare half @llvm.maximum.f16(half, half)
 declare <8 x half> @llvm.minimum.v8f16(<8 x half>, <8 x half>)
 declare <8 x half> @llvm.maximum.v8f16(<8 x half>, <8 x half>)
+declare <16 x half> @llvm.minimum.v16f16(<16 x half>, <16 x half>)
+declare <16 x half> @llvm.maximum.v16f16(<16 x half>, <16 x half>)
+declare <32 x half> @llvm.minimum.v32f16(<32 x half>, <32 x half>)
+declare <32 x half> @llvm.maximum.v32f16(<32 x half>, <32 x half>)
 
 define half @test_fminimum(half %x, half %y) {
 ; CHECK-LABEL: test_fminimum:
@@ -25,20 +29,10 @@ define half @test_fminimum(half %x, half %y) {
   ret half %z
 }
 
-define <8 x half> @test_fminimum_scalarize(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
-; CHECK-LABEL: test_fminimum_scalarize:
+define <8 x half> @test_fminimum_v8f16(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
+; CHECK-LABEL: test_fminimum_v8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcmpltph %xmm1, %xmm0, %k1
-; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm2 {%k1}
-; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm0, %k1
-; CHECK-NEXT:    vpblendmw %xmm0, %xmm2, %xmm0 {%k1}
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1}
-; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vcmpeqph %xmm1, %xmm2, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm0, %xmm2 {%k1}
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
+; CHECK-NEXT:    vminph %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %r = call <8 x half> @llvm.minimum.v8f16(<8 x half> %x, <8 x half> %y)
   ret <8 x half> %r
@@ -113,19 +107,10 @@ define half @test_fmaximum(half %x, half %y) {
   ret half %r
 }
 
-define <8 x half> @test_fmaximum_scalarize(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
-; CHECK-LABEL: test_fmaximum_scalarize:
+define <8 x half> @test_fmaximum_v8f16(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
+; CHECK-LABEL: test_fmaximum_v8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcmpltph %xmm0, %xmm1, %k1
-; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm2 {%k1}
-; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpblendmw %xmm0, %xmm2, %xmm0 {%k1}
-; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1}
-; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vcmpeqph %xmm1, %xmm2, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm0, %xmm2 {%k1}
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
+; CHECK-NEXT:    vmaxph %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %r = call <8 x half> @llvm.maximum.v8f16(<8 x half> %x, <8 x half> %y)
   ret <8 x half> %r
@@ -186,3 +171,50 @@ define half @test_fmaximum_combine_cmps(half %x, half %y) {
   %2 = tail call half @llvm.maximum.f16(half %x, half %1)
   ret half %2
 }
+
+define <16 x half> @test_fminimum_v16f16(<16 x half> %x, <16 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
+; CHECK-LABEL: test_fminimum_v16f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vminph %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %r = call <16 x half> @llvm.minimum.v16f16(<16 x half> %x, <16 x half> %y)
+  ret <16 x half> %r
+}
+
+define <16 x half> @test_fmaximum_v16f16_nans(<16 x half> %x, <16 x half> %y) "no-signed-zeros-fp-math"="true" {
+; CHECK-LABEL: test_fmaximum_v16f16_nans:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmaxph %ymm1, %ymm0, %ymm1
+; CHECK-NEXT:    vcmpunordph %ymm0, %ymm0, %k1
+; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    retq
+  %r = call <16 x half> @llvm.maximum.v16f16(<16 x half> %x, <16 x half> %y)
+  ret <16 x half> %r
+}
+
+define <32 x half> @test_fminimum_v32f16_szero(<32 x half> %x, <32 x half> %y) "no-nans-fp-math"="true" {
+; CHECK-LABEL: test_fminimum_v32f16_szero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovw2m %zmm0, %k1
+; CHECK-NEXT:    vpblendmw %zmm0, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vminph %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %r = call <32 x half> @llvm.minimum.v32f16(<32 x half> %x, <32 x half> %y)
+  ret <32 x half> %r
+}
+
+define <32 x half> @test_fmaximum_v32f16_nans_szero(<32 x half> %x, <32 x half> %y) {
+; CHECK-LABEL: test_fmaximum_v32f16_nans_szero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovw2m %zmm0, %k1
+; CHECK-NEXT:    vpblendmw %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmaxph %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    vcmpunordph %zmm1, %zmm1, %k1
+; CHECK-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %r = call <32 x half> @llvm.maximum.v32f16(<32 x half> %x, <32 x half> %y)
+  ret <32 x half> %r
+}
diff --git a/llvm/test/CodeGen/X86/block-placement.ll b/llvm/test/CodeGen/X86/block-placement.ll
index 6752934..1369131 100644
--- a/llvm/test/CodeGen/X86/block-placement.ll
+++ b/llvm/test/CodeGen/X86/block-placement.ll
@@ -312,7 +312,7 @@ exit:
   ret i32 %sum
 }
 
-define void @unnatural_cfg1() {
+define void @unnatural_cfg1(i1 %arg) {
 ; Test that we can handle a loop with an inner unnatural loop at the end of
 ; a function. This is a gross CFG reduced out of the single source GCC.
 ; CHECK-LABEL: unnatural_cfg1
@@ -327,7 +327,7 @@ loop.header:
   br label %loop.body1
 
 loop.body1:
-  br i1 undef, label %loop.body3, label %loop.body2
+  br i1 %arg, label %loop.body3, label %loop.body2
 
 loop.body2:
   %ptr = load ptr, ptr undef, align 4
@@ -341,14 +341,14 @@ loop.body3:
   br i1 %comp, label %loop.body4, label %loop.body5
 
 loop.body4:
-  br i1 undef, label %loop.header, label %loop.body5
+  br i1 %arg, label %loop.header, label %loop.body5
 
 loop.body5:
   %ptr2 = load ptr, ptr undef, align 4
   br label %loop.body3
 }
 
-define void @unnatural_cfg2(ptr %p0, i32 %a0) {
+define void @unnatural_cfg2(ptr %p0, i32 %a0, i1 %arg) {
 ; Test that we can handle a loop with a nested natural loop *and* an unnatural
 ; loop. This was reduced from a crash on block placement when run over
 ; single-source GCC.
@@ -372,10 +372,10 @@ loop.header:
 
 loop.body1:
   %val0 = load ptr, ptr undef, align 4
-  br i1 undef, label %loop.body2, label %loop.inner1.begin
+  br i1 %arg, label %loop.body2, label %loop.inner1.begin
 
 loop.body2:
-  br i1 undef, label %loop.body4, label %loop.body3
+  br i1 %arg, label %loop.body4, label %loop.body3
 
 loop.body3:
   %ptr1 = getelementptr inbounds i32, ptr %val0, i32 0
@@ -467,7 +467,7 @@ exit:
   ret i32 %merge
 }
 
-define void @fpcmp_unanalyzable_branch(i1 %cond, double %a0) {
+define void @fpcmp_unanalyzable_branch(i1 %cond, double %a0, i1 %arg) {
 ; This function's CFG contains an once-unanalyzable branch (une on floating
 ; points). As now it becomes analyzable, we should get best layout in which each
 ; edge in 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' -> 'if.end' is
@@ -493,7 +493,7 @@ entry.if.then_crit_edge:
   br label %if.then
 
 lor.lhs.false:
-  br i1 undef, label %if.end, label %exit
+  br i1 %arg, label %if.end, label %exit
 
 exit:
   %cmp.i = fcmp une double 0.000000e+00, %a0
@@ -516,7 +516,7 @@ declare i32 @f()
 declare i32 @g()
 declare i32 @h(i32 %x)
 
-define i32 @test_global_cfg_break_profitability() {
+define i32 @test_global_cfg_break_profitability(i1 %arg) {
 ; Check that our metrics for the profitability of a CFG break are global rather
 ; than local. A successor may be very hot, but if the current block isn't, it
 ; doesn't matter. Within this test the 'then' block is slightly warmer than the
@@ -530,7 +530,7 @@ define i32 @test_global_cfg_break_profitability() {
 ; CHECK: ret
 
 entry:
-  br i1 undef, label %then, label %else, !prof !2
+  br i1 %arg, label %then, label %else, !prof !2
 
 then:
   %then.result = call i32 @f()
@@ -600,7 +600,7 @@ cleanup:
   unreachable
 }
 
-define void @test_unnatural_cfg_backwards_inner_loop() {
+define void @test_unnatural_cfg_backwards_inner_loop(i1 %arg) {
 ; Test that when we encounter an unnatural CFG structure after having formed
 ; a chain for an inner loop which happened to be laid out backwards we don't
 ; attempt to merge onto the wrong end of the inner loop just because we find it
@@ -612,7 +612,7 @@ define void @test_unnatural_cfg_backwards_inner_loop() {
 ; CHECK: %loop3
 
 entry:
-  br i1 undef, label %loop2a, label %body
+  br i1 %arg, label %loop2a, label %body
 
 body:
   br label %loop2a
@@ -692,7 +692,7 @@ exit:
   ret void
 }
 
-define void @unanalyzable_branch_to_free_block(float %x) {
+define void @unanalyzable_branch_to_free_block(float %x, i1 %arg) {
 ; Ensure that we can handle unanalyzable branches where the destination block
 ; gets selected as the best free block in the CFG.
 ;
@@ -704,7 +704,7 @@ define void @unanalyzable_branch_to_free_block(float %x) {
 ; CHECK: %exit
 
 entry:
-  br i1 undef, label %a, label %b
+  br i1 %arg, label %a, label %b
 
 a:
   call i32 @f()
diff --git a/llvm/test/CodeGen/X86/clobber_frame_ptr.ll b/llvm/test/CodeGen/X86/clobber_frame_ptr.ll
index f6b3883..e7ffc47 100644
--- a/llvm/test/CodeGen/X86/clobber_frame_ptr.ll
+++ b/llvm/test/CodeGen/X86/clobber_frame_ptr.ll
@@ -173,7 +173,7 @@ define ghccc void @test5() {
 ; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
 ; CHECK-NEXT:    jmp tail@PLT # TAILCALL
 entry:
-  br i1 undef, label %then, label %else
+  br i1 poison, label %then, label %else
 
 then:
   store i64 0, ptr undef
@@ -186,4 +186,3 @@ else:
 exit:
   ret void
 }
-
diff --git a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
index c611e89..f3070cd 100644
--- a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
+++ b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' %s -o - | FileCheck %s
 ; This file tests the different cases what are involved when codegen prepare
 ; tries to get sign/zero extension out of the way of addressing mode.
@@ -9,14 +10,17 @@ target triple = "x86_64-apple-macosx"
 
 
 ; Check that we correctly promote both operands of the promotable add.
-; CHECK-LABEL: @twoArgsPromotion
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i32 %arg1 to i64
-; CHECK: [[ARG2SEXT:%[a-zA-Z_0-9-]+]] = sext i32 %arg2 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], [[ARG2SEXT]]
-; CHECK: inttoptr i64 [[PROMOTED]] to ptr
-; CHECK: ret
 define i8 @twoArgsPromotion(i32 %arg1, i32 %arg2) {
-  %add = add nsw i32 %arg1, %arg2 
+; CHECK-LABEL: define i8 @twoArgsPromotion(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED:%.*]] = sext i32 [[ARG1]] to i64
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = sext i32 [[ARG2]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED]], [[PROMOTED2]]
+; CHECK-NEXT:    [[BASE:%.*]] = inttoptr i64 [[ADD]] to ptr
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[BASE]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add nsw i32 %arg1, %arg2
   %sextadd = sext i32 %add to i64
   %base = inttoptr i64 %sextadd to ptr
   %res = load i8, ptr %base
@@ -28,11 +32,16 @@ define i8 @twoArgsPromotion(i32 %arg1, i32 %arg2) {
 ; Otherwise, we will increase the number of instruction executed.
 ; (This is a heuristic of course, because the new sext could have been
 ; merged with something else.)
-; CHECK-LABEL: @twoArgsNoPromotion
-; CHECK: add nsw i32 %arg1, %arg2
-; CHECK: ret
 define i8 @twoArgsNoPromotion(i32 %arg1, i32 %arg2, ptr %base) {
-  %add = add nsw i32 %arg1, %arg2 
+; CHECK-LABEL: define i8 @twoArgsNoPromotion(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[ARG1]], [[ARG2]]
+; CHECK-NEXT:    [[SEXTADD:%.*]] = sext i32 [[ADD]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[SEXTADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add nsw i32 %arg1, %arg2
   %sextadd = sext i32 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -41,11 +50,16 @@ define i8 @twoArgsNoPromotion(i32 %arg1, i32 %arg2, ptr %base) {
 
 ; Check that we do not promote when the related instruction does not have
 ; the nsw flag.
-; CHECK-LABEL: @noPromotion
-; CHECK-NOT: add i64
-; CHECK: ret
 define i8 @noPromotion(i32 %arg1, i32 %arg2, ptr %base) {
-  %add = add i32 %arg1, %arg2 
+; CHECK-LABEL: define i8 @noPromotion(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[ARG1]], [[ARG2]]
+; CHECK-NEXT:    [[SEXTADD:%.*]] = sext i32 [[ADD]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[SEXTADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add i32 %arg1, %arg2
   %sextadd = sext i32 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -53,13 +67,16 @@ define i8 @noPromotion(i32 %arg1, i32 %arg2, ptr %base) {
 }
 
 ; Check that we correctly promote constant arguments.
-; CHECK-LABEL: @oneArgPromotion
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i32 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotion(i32 %arg1, ptr %base) {
-  %add = add nsw i32 %arg1, 1 
+; CHECK-LABEL: define i8 @oneArgPromotion(
+; CHECK-SAME: i32 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED:%.*]] = sext i32 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add nsw i32 %arg1, 1
   %sextadd = sext i32 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -67,14 +84,17 @@ define i8 @oneArgPromotion(i32 %arg1, ptr %base) {
 }
 
 ; Check that we are able to merge a sign extension with a zero extension.
-; CHECK-LABEL: @oneArgPromotionZExt
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1ZEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionZExt(i8 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionZExt(
+; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = zext i8 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED2]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %zext = zext i8 %arg1 to i32
-  %add = add nsw i32 %zext, 1 
+  %add = add nsw i32 %zext, 1
   %sextadd = sext i32 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -88,11 +108,14 @@ define i8 @oneArgPromotionZExt(i8 %arg1, ptr %base) {
 ; more thing in the addressing mode. Therefore the modification is
 ; rolled back.
 ; Still, this test case exercises the desired code path.
-; CHECK-LABEL: @oneArgPromotionCstZExt
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 0, 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionCstZExt(ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionCstZExt(
+; CHECK-SAME: ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 0, 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %cst = zext i16 undef to i32
   %add = add nsw i32 %cst, 1
   %sextadd = sext i32 %add to i64
@@ -103,15 +126,18 @@ define i8 @oneArgPromotionCstZExt(ptr %base) {
 
 ; Check that we do not promote truncate when we cannot determine the
 ; bits that are dropped.
-; CHECK-LABEL: @oneArgPromotionBlockTrunc1
-; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 %arg1 to i8
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 [[ARG1TRUNC]] to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionBlockTrunc1(i32 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionBlockTrunc1(
+; CHECK-SAME: i32 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[ARG1]] to i8
+; CHECK-NEXT:    [[PROMOTED:%.*]] = sext i8 [[TRUNC]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %trunc = trunc i32 %arg1 to i8
-  %add = add nsw i8 %trunc, 1 
+  %add = add nsw i8 %trunc, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -120,17 +146,20 @@ define i8 @oneArgPromotionBlockTrunc1(i32 %arg1, ptr %base) {
 
 ; Check that we do not promote truncate when we cannot determine all the
 ; bits that are dropped.
-; CHECK-LABEL: @oneArgPromotionBlockTrunc2
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i16 %arg1 to i32
-; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[ARG1SEXT]] to i8
-; CHECK: [[ARG1SEXT64:%[a-zA-Z_0-9-]+]] = sext i8 [[ARG1TRUNC]] to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT64]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionBlockTrunc2(i16 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionBlockTrunc2(
+; CHECK-SAME: i16 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[SEXTARG1:%.*]] = sext i16 [[ARG1]] to i32
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[SEXTARG1]] to i8
+; CHECK-NEXT:    [[PROMOTED:%.*]] = sext i8 [[TRUNC]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %sextarg1 = sext i16 %arg1 to i32
   %trunc = trunc i32 %sextarg1 to i8
-  %add = add nsw i8 %trunc, 1 
+  %add = add nsw i8 %trunc, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -139,15 +168,18 @@ define i8 @oneArgPromotionBlockTrunc2(i16 %arg1, ptr %base) {
 
 ; Check that we are able to promote truncate when we know all the bits
 ; that are dropped.
-; CHECK-LABEL: @oneArgPromotionPassTruncKeepSExt
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionPassTruncKeepSExt(i1 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionPassTruncKeepSExt(
+; CHECK-SAME: i1 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED:%.*]] = sext i1 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %sextarg1 = sext i1 %arg1 to i32
   %trunc = trunc i32 %sextarg1 to i8
-  %add = add nsw i8 %trunc, 1 
+  %add = add nsw i8 %trunc, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -156,17 +188,19 @@ define i8 @oneArgPromotionPassTruncKeepSExt(i1 %arg1, ptr %base) {
 
 ; On X86 truncate are free. Check that we are able to promote the add
 ; to be used as addressing mode and that we insert a truncate for the other
-; use. 
-; CHECK-LABEL: @oneArgPromotionTruncInsert
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
-; CHECK: [[TRUNC:%[a-zA-Z_0-9-]+]] = trunc i64 [[PROMOTED]] to i8
-; CHECK: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: [[LOAD:%[a-zA-Z_0-9-]+]] = load i8, ptr [[GEP]]
-; CHECK: add i8 [[LOAD]], [[TRUNC]]
-; CHECK: ret
+; use.
 define i8 @oneArgPromotionTruncInsert(i8 %arg1, ptr %base) {
-  %add = add nsw i8 %arg1, 1 
+; CHECK-LABEL: define i8 @oneArgPromotionTruncInsert(
+; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = sext i8 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED2]], 1
+; CHECK-NEXT:    [[PROMOTED:%.*]] = trunc i64 [[ADD]] to i8
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[FINALRES:%.*]] = add i8 [[RES]], [[PROMOTED]]
+; CHECK-NEXT:    ret i8 [[FINALRES]]
+;
+  %add = add nsw i8 %arg1, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -175,15 +209,20 @@ define i8 @oneArgPromotionTruncInsert(i8 %arg1, ptr %base) {
 }
 
 ; Cannot sext from a larger type than the promoted type.
-; CHECK-LABEL: @oneArgPromotionLargerType
-; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i128 %arg1 to i8
-; CHECK: [[ARG1SEXT64:%[a-zA-Z_0-9-]+]] = sext i8 [[ARG1TRUNC]] to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT64]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionLargerType(i128 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionLargerType(
+; CHECK-SAME: i128 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i128 [[ARG1]] to i8
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = sext i8 [[TRUNC]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED2]], 1
+; CHECK-NEXT:    [[PROMOTED:%.*]] = trunc i64 [[ADD]] to i8
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[FINALRES:%.*]] = add i8 [[RES]], [[PROMOTED]]
+; CHECK-NEXT:    ret i8 [[FINALRES]]
+;
   %trunc = trunc i128 %arg1 to i8
-  %add = add nsw i8 %trunc, 1 
+  %add = add nsw i8 %trunc, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -194,18 +233,20 @@ define i8 @oneArgPromotionLargerType(i128 %arg1, ptr %base) {
 ; Use same inserted trunc
 ; On X86 truncate are free. Check that we are able to promote the add
 ; to be used as addressing mode and that we insert a truncate for
-; *all* the other uses. 
-; CHECK-LABEL: @oneArgPromotionTruncInsertSeveralUse
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
-; CHECK: [[TRUNC:%[a-zA-Z_0-9-]+]] = trunc i64 [[PROMOTED]] to i8
-; CHECK: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: [[LOAD:%[a-zA-Z_0-9-]+]] = load i8, ptr [[GEP]]
-; CHECK: [[ADDRES:%[a-zA-Z_0-9-]+]] = add i8 [[LOAD]], [[TRUNC]]
-; CHECK: add i8 [[ADDRES]], [[TRUNC]]
-; CHECK: ret
+; *all* the other uses.
 define i8 @oneArgPromotionTruncInsertSeveralUse(i8 %arg1, ptr %base) {
-  %add = add nsw i8 %arg1, 1 
+; CHECK-LABEL: define i8 @oneArgPromotionTruncInsertSeveralUse(
+; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = sext i8 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED2]], 1
+; CHECK-NEXT:    [[PROMOTED:%.*]] = trunc i64 [[ADD]] to i8
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ALMOSTFINALRES:%.*]] = add i8 [[RES]], [[PROMOTED]]
+; CHECK-NEXT:    [[FINALRES:%.*]] = add i8 [[ALMOSTFINALRES]], [[PROMOTED]]
+; CHECK-NEXT:    ret i8 [[FINALRES]]
+;
+  %add = add nsw i8 %arg1, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -216,16 +257,18 @@ define i8 @oneArgPromotionTruncInsertSeveralUse(i8 %arg1, ptr %base) {
 
 ; Check that the promoted instruction is used for all uses of the original
 ; sign extension.
-; CHECK-LABEL: @oneArgPromotionSExtSeveralUse
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
-; CHECK: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: [[LOAD:%[a-zA-Z_0-9-]+]] = load i8, ptr [[GEP]]
-; CHECK: [[ADDRES:%[a-zA-Z_0-9-]+]] = zext i8 [[LOAD]] to i64
-; CHECK: add i64 [[ADDRES]], [[PROMOTED]]
-; CHECK: ret
 define i64 @oneArgPromotionSExtSeveralUse(i8 %arg1, ptr %base) {
-  %add = add nsw i8 %arg1, 1 
+; CHECK-LABEL: define i64 @oneArgPromotionSExtSeveralUse(
+; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED:%.*]] = sext i8 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ALMOSTFINALRES:%.*]] = zext i8 [[RES]] to i64
+; CHECK-NEXT:    [[FINALRES:%.*]] = add i64 [[ALMOSTFINALRES]], [[ADD]]
+; CHECK-NEXT:    ret i64 [[FINALRES]]
+;
+  %add = add nsw i8 %arg1, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -249,16 +292,19 @@ define i64 @oneArgPromotionSExtSeveralUse(i8 %arg1, ptr %base) {
 ; - Setting the operands of the promoted instruction with the promoted values.
 ; - Moving instruction around (mainly sext when promoting instruction).
 ; Each type of those promotions has to be undo at least once during this
-; specific test. 
-; CHECK-LABEL: @twoArgsPromotionNest
-; CHECK: [[ORIG:%[a-zA-Z_0-9-]+]] = add nsw i32 %arg1, %arg2
-; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ORIG]], [[ORIG]]
-; CHECK: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[SEXT]]
-; CHECK: ret
+; specific test.
 define i8 @twoArgsPromotionNest(i32 %arg1, i32 %arg2, ptr %base) {
+; CHECK-LABEL: define i8 @twoArgsPromotionNest(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTABLEADD1:%.*]] = add nsw i32 [[ARG1]], [[ARG2]]
+; CHECK-NEXT:    [[PROMOTABLEADD2:%.*]] = add nsw i32 [[PROMOTABLEADD1]], [[PROMOTABLEADD1]]
+; CHECK-NEXT:    [[SEXTADD:%.*]] = sext i32 [[PROMOTABLEADD2]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[SEXTADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %promotableadd1 = add nsw i32 %arg1, %arg2
-  %promotableadd2 = add nsw i32 %promotableadd1, %promotableadd1 
+  %promotableadd2 = add nsw i32 %promotableadd1, %promotableadd1
   %sextadd = sext i32 %promotableadd2 to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -270,18 +316,21 @@ define i8 @twoArgsPromotionNest(i32 %arg1, i32 %arg2, ptr %base) {
 ; The matcher first promotes the add, removes the trunc and promotes
 ; the sext of arg1.
 ; Then, the matcher cannot use an addressing mode r + r + r, thus it
-; rolls back. 
-; CHECK-LABEL: @twoArgsNoPromotionRemove
-; CHECK: [[SEXTARG1:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i32
-; CHECK: [[TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[SEXTARG1]] to i8
-; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[TRUNC]], %arg2
-; CHECK: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i64
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[SEXT]]
-; CHECK: ret
+; rolls back.
 define i8 @twoArgsNoPromotionRemove(i1 %arg1, i8 %arg2, ptr %base) {
+; CHECK-LABEL: define i8 @twoArgsNoPromotionRemove(
+; CHECK-SAME: i1 [[ARG1:%.*]], i8 [[ARG2:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[SEXTARG1:%.*]] = sext i1 [[ARG1]] to i32
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[SEXTARG1]] to i8
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i8 [[TRUNC]], [[ARG2]]
+; CHECK-NEXT:    [[SEXTADD:%.*]] = sext i8 [[ADD]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[SEXTADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %sextarg1 = sext i1 %arg1 to i32
   %trunc = trunc i32 %sextarg1 to i8
-  %add = add nsw i8 %trunc, %arg2 
+  %add = add nsw i8 %trunc, %arg2
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -301,29 +350,40 @@ define i8 @twoArgsNoPromotionRemove(i1 %arg1, i8 %arg2, ptr %base) {
 ; Check that we did not promote anything in the final matching.
 ;
 ; <rdar://problem/16020230>
-; CHECK-LABEL: @checkProfitability
-; CHECK-NOT: {{%[a-zA-Z_0-9-]+}} = sext i32 %arg1 to i64
-; CHECK-NOT: {{%[a-zA-Z_0-9-]+}} = sext i32 %arg2 to i64
-; CHECK: [[SHL:%[a-zA-Z_0-9-]+]] = shl nsw i32 %arg1, 1
-; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SHL]], %arg2
-; CHECK: [[SEXTADD:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64
 ; BB then
-; CHECK: [[BASE1:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[SEXTADD]] to ptr
-; CHECK: [[FULL1:%[a-zA-Z_0-9-]+]] = getelementptr i8, ptr [[BASE1]], i64 48
-; CHECK: load i32, ptr [[FULL1]]
 ; BB else
-; CHECK: [[BASE2:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[SEXTADD]] to ptr
-; CHECK: [[FULL2:%[a-zA-Z_0-9-]+]] = getelementptr i8, ptr [[BASE2]], i64 48
-; CHECK: load i32, ptr [[FULL2]]
-; CHECK: ret
 define i32 @checkProfitability(i32 %arg1, i32 %arg2, i1 %test) {
+; CHECK-LABEL: define i32 @checkProfitability(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i1 [[TEST:%.*]]) {
+; CHECK-NEXT:    [[SHL:%.*]] = shl nsw i32 [[ARG1]], 1
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[SHL]], [[ARG2]]
+; CHECK-NEXT:    [[SEXTIDX1:%.*]] = sext i32 [[ADD1]] to i64
+; CHECK-NEXT:    br i1 [[TEST]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = inttoptr i64 [[SEXTIDX1]] to ptr
+; CHECK-NEXT:    [[SUNKADDR13:%.*]] = getelementptr i8, ptr [[SUNKADDR]], i64 48
+; CHECK-NEXT:    [[RES1:%.*]] = load i32, ptr [[SUNKADDR13]], align 4
+; CHECK-NEXT:    br label %[[END:.*]]
+; CHECK:       [[ELSE]]:
+; CHECK-NEXT:    [[SUNKADDR17:%.*]] = inttoptr i64 [[SEXTIDX1]] to ptr
+; CHECK-NEXT:    [[SUNKADDR18:%.*]] = getelementptr i8, ptr [[SUNKADDR17]], i64 48
+; CHECK-NEXT:    [[RES2:%.*]] = load i32, ptr [[SUNKADDR18]], align 4
+; CHECK-NEXT:    br label %[[END]]
+; CHECK:       [[END]]:
+; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ [[RES1]], %[[THEN]] ], [ [[RES2]], %[[ELSE]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[SEXTIDX1]] to i32
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[TMP]], [[TMP1]]
+; CHECK-NEXT:    [[ADDR:%.*]] = inttoptr i32 [[RES]] to ptr
+; CHECK-NEXT:    [[FINAL:%.*]] = load i32, ptr [[ADDR]], align 4
+; CHECK-NEXT:    ret i32 [[FINAL]]
+;
   %shl = shl nsw i32 %arg1, 1
   %add1 = add nsw i32 %shl, %arg2
   %sextidx1 = sext i32 %add1 to i64
   %tmpptr = inttoptr i64 %sextidx1 to ptr
   %arrayidx1 = getelementptr i32, ptr %tmpptr, i64 12
   br i1 %test, label %then, label %else
-then: 
+then:
   %res1 = load i32, ptr %arrayidx1
   br label %end
 else:
@@ -346,15 +406,47 @@ end:
 ; We used to crash on this function because we did not return the right
 ; promoted instruction for %conv.i.
 ; Make sure we generate the right code now.
-; CHECK-LABEL: @fn3
 ; %conv.i is used twice and only one of its use is being promoted.
 ; Use it at the starting point for the matching.
-; CHECK: %conv.i = zext i16 [[PLAIN_OPND:%[.a-zA-Z_0-9-]+]] to i32
-; CHECK-NEXT: [[PROMOTED_CONV:%[.a-zA-Z_0-9-]+]] = zext i16 [[PLAIN_OPND]] to i64
-; CHECK-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = getelementptr i8, ptr %P, i64 [[PROMOTED_CONV]]
-; CHECK-NEXT: [[ADDR:%[a-zA-Z_0-9-]+]] = getelementptr i8, ptr [[ADD]], i64 7
-; CHECK-NEXT: load i8, ptr [[ADDR]], align 1
 define signext i16 @fn3(ptr nocapture readonly %P) {
+; CHECK-LABEL: define signext i16 @fn3(
+; CHECK-SAME: ptr nocapture readonly [[P:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[WHILE_BODY_I_I:.*]]
+; CHECK:       [[WHILE_BODY_I_I]]:
+; CHECK-NEXT:    [[SRC_ADDR_0_I_I:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[INC_I_I:%.*]], %[[WHILE_BODY_I_I]] ]
+; CHECK-NEXT:    [[INC_I_I]] = add i16 [[SRC_ADDR_0_I_I]], 1
+; CHECK-NEXT:    [[IDXPROM_I_I:%.*]] = sext i16 [[SRC_ADDR_0_I_I]] to i64
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[IDXPROM_I_I]]
+; CHECK-NEXT:    [[SUNKADDR2:%.*]] = getelementptr inbounds i8, ptr [[SUNKADDR]], i64 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[SUNKADDR2]], align 1
+; CHECK-NEXT:    [[CONV2_I_I:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT:    [[AND_I_I:%.*]] = and i32 [[CONV2_I_I]], 15
+; CHECK-NEXT:    store i32 [[AND_I_I]], ptr @a, align 4
+; CHECK-NEXT:    [[TOBOOL_I_I:%.*]] = icmp eq i32 [[AND_I_I]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_I_I]], label %[[WHILE_BODY_I_I]], label %[[FN1_EXIT_I:.*]]
+; CHECK:       [[FN1_EXIT_I]]:
+; CHECK-NEXT:    [[CONV_I:%.*]] = zext i16 [[INC_I_I]] to i32
+; CHECK-NEXT:    [[PROMOTED4:%.*]] = zext i16 [[INC_I_I]] to i64
+; CHECK-NEXT:    [[SUNKADDR5:%.*]] = getelementptr i8, ptr [[P]], i64 [[PROMOTED4]]
+; CHECK-NEXT:    [[SUNKADDR6:%.*]] = getelementptr i8, ptr [[SUNKADDR5]], i64 7
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[SUNKADDR6]], align 1
+; CHECK-NEXT:    [[CONV2_I:%.*]] = sext i8 [[TMP2]] to i16
+; CHECK-NEXT:    store i16 [[CONV2_I]], ptr @b, align 2
+; CHECK-NEXT:    [[SUB4_I:%.*]] = sub nsw i32 0, [[CONV_I]]
+; CHECK-NEXT:    [[CONV5_I:%.*]] = zext i16 [[CONV2_I]] to i32
+; CHECK-NEXT:    [[CMP_I:%.*]] = icmp sgt i32 [[CONV5_I]], [[SUB4_I]]
+; CHECK-NEXT:    br i1 [[CMP_I]], label %[[IF_THEN_I:.*]], label %[[FN2_EXIT:.*]]
+; CHECK:       [[IF_THEN_I]]:
+; CHECK-NEXT:    [[END_I:%.*]] = getelementptr inbounds [[STRUCT_DNS_PACKET:%.*]], ptr [[P]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[END_I]], align 4
+; CHECK-NEXT:    [[SUB7_I:%.*]] = add i32 [[TMP3]], 65535
+; CHECK-NEXT:    [[CONV8_I:%.*]] = trunc i32 [[SUB7_I]] to i16
+; CHECK-NEXT:    br label %[[FN2_EXIT]]
+; CHECK:       [[FN2_EXIT]]:
+; CHECK-NEXT:    [[RETVAL_0_I:%.*]] = phi i16 [ [[CONV8_I]], %[[IF_THEN_I]] ], [ undef, %[[FN1_EXIT_I]] ]
+; CHECK-NEXT:    ret i16 [[RETVAL_0_I]]
+;
 entry:
   %tmp = getelementptr inbounds %struct.dns_packet, ptr %P, i64 0, i32 2
   br label %while.body.i.i
@@ -399,13 +491,16 @@ fn2.exit:                                         ; preds = %if.then.i, %fn1.exi
 
 ; Check that we do not promote an extension if the non-wrapping flag does not
 ; match the kind of the extension.
-; CHECK-LABEL: @noPromotionFlag
-; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 %arg1, %arg2
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = zext i32 [[ADD]] to i64
-; CHECK: inttoptr i64 [[PROMOTED]] to ptr
-; CHECK: ret
 define i8 @noPromotionFlag(i32 %arg1, i32 %arg2) {
-  %add = add nsw i32 %arg1, %arg2 
+; CHECK-LABEL: define i8 @noPromotionFlag(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]]) {
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[ARG1]], [[ARG2]]
+; CHECK-NEXT:    [[ZEXTADD:%.*]] = zext i32 [[ADD]] to i64
+; CHECK-NEXT:    [[BASE:%.*]] = inttoptr i64 [[ZEXTADD]] to ptr
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[BASE]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add nsw i32 %arg1, %arg2
   %zextadd = zext i32 %add to i64
   %base = inttoptr i64 %zextadd to ptr
   %res = load i8, ptr %base
@@ -413,14 +508,17 @@ define i8 @noPromotionFlag(i32 %arg1, i32 %arg2) {
 }
 
 ; Check that we correctly promote both operands of the promotable add with zext.
-; CHECK-LABEL: @twoArgsPromotionZExt
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i32 %arg1 to i64
-; CHECK: [[ARG2ZEXT:%[a-zA-Z_0-9-]+]] = zext i32 %arg2 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], [[ARG2ZEXT]]
-; CHECK: inttoptr i64 [[PROMOTED]] to ptr
-; CHECK: ret
 define i8 @twoArgsPromotionZExt(i32 %arg1, i32 %arg2) {
-  %add = add nuw i32 %arg1, %arg2 
+; CHECK-LABEL: define i8 @twoArgsPromotionZExt(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED:%.*]] = zext i32 [[ARG1]] to i64
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = zext i32 [[ARG2]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i64 [[PROMOTED]], [[PROMOTED2]]
+; CHECK-NEXT:    [[BASE:%.*]] = inttoptr i64 [[ADD]] to ptr
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[BASE]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add nuw i32 %arg1, %arg2
   %zextadd = zext i32 %add to i64
   %base = inttoptr i64 %zextadd to ptr
   %res = load i8, ptr %base
@@ -428,13 +526,16 @@ define i8 @twoArgsPromotionZExt(i32 %arg1, i32 %arg2) {
 }
 
 ; Check that we correctly promote constant arguments.
-; CHECK-LABEL: @oneArgPromotionNegativeCstZExt
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 255
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionNegativeCstZExt(i8 %arg1, ptr %base) {
-  %add = add nuw i8 %arg1, -1 
+; CHECK-LABEL: define i8 @oneArgPromotionNegativeCstZExt(
+; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED:%.*]] = zext i8 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i64 [[PROMOTED]], 255
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add nuw i8 %arg1, -1
   %zextadd = zext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %zextadd
   %res = load i8, ptr %arrayidx
@@ -442,14 +543,17 @@ define i8 @oneArgPromotionNegativeCstZExt(i8 %arg1, ptr %base) {
 }
 
 ; Check that we are able to merge two zero extensions.
-; CHECK-LABEL: @oneArgPromotionZExtZExt
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionZExtZExt(i8 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionZExtZExt(
+; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = zext i8 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i64 [[PROMOTED2]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %zext = zext i8 %arg1 to i32
-  %add = add nuw i32 %zext, 1 
+  %add = add nuw i32 %zext, 1
   %zextadd = zext i32 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %zextadd
   %res = load i8, ptr %arrayidx
@@ -458,17 +562,20 @@ define i8 @oneArgPromotionZExtZExt(i8 %arg1, ptr %base) {
 
 ; Check that we do not promote truncate when the dropped bits
 ; are of a different kind.
-; CHECK-LABEL: @oneArgPromotionBlockTruncZExt
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i32
-; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[ARG1SEXT]] to i8
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[ARG1TRUNC]] to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionBlockTruncZExt(i1 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionBlockTruncZExt(
+; CHECK-SAME: i1 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[SEXTARG1:%.*]] = sext i1 [[ARG1]] to i32
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[SEXTARG1]] to i8
+; CHECK-NEXT:    [[PROMOTED:%.*]] = zext i8 [[TRUNC]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %sextarg1 = sext i1 %arg1 to i32
   %trunc = trunc i32 %sextarg1 to i8
-  %add = add nuw i8 %trunc, 1 
+  %add = add nuw i8 %trunc, 1
   %zextadd = zext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %zextadd
   %res = load i8, ptr %arrayidx
@@ -477,15 +584,18 @@ define i8 @oneArgPromotionBlockTruncZExt(i1 %arg1, ptr %base) {
 
 ; Check that we are able to promote truncate when we know all the bits
 ; that are dropped.
-; CHECK-LABEL: @oneArgPromotionPassTruncZExt
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i1 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionPassTruncZExt(i1 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionPassTruncZExt(
+; CHECK-SAME: i1 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = zext i1 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i64 [[PROMOTED2]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %sextarg1 = zext i1 %arg1 to i32
   %trunc = trunc i32 %sextarg1 to i8
-  %add = add nuw i8 %trunc, 1 
+  %add = add nuw i8 %trunc, 1
   %zextadd = zext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %zextadd
   %res = load i8, ptr %arrayidx
@@ -493,15 +603,18 @@ define i8 @oneArgPromotionPassTruncZExt(i1 %arg1, ptr %base) {
 }
 
 ; Check that we do not promote sext with zext.
-; CHECK-LABEL: @oneArgPromotionBlockSExtZExt
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i8
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[ARG1SEXT]] to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionBlockSExtZExt(i1 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionBlockSExtZExt(
+; CHECK-SAME: i1 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[SEXTARG1:%.*]] = sext i1 [[ARG1]] to i8
+; CHECK-NEXT:    [[PROMOTED:%.*]] = zext i8 [[SEXTARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %sextarg1 = sext i1 %arg1 to i8
-  %add = add nuw i8 %sextarg1, 1 
+  %add = add nuw i8 %sextarg1, 1
   %zextadd = zext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %zextadd
   %res = load i8, ptr %arrayidx
diff --git a/llvm/test/CodeGen/X86/codegen-prepare-replacephi.mir b/llvm/test/CodeGen/X86/codegen-prepare-replacephi.mir
index aceb344..13f3f3a 100644
--- a/llvm/test/CodeGen/X86/codegen-prepare-replacephi.mir
+++ b/llvm/test/CodeGen/X86/codegen-prepare-replacephi.mir
@@ -6,7 +6,7 @@
 # "Replacement PHI node is already replaced."
 
 --- |
-  define void @f1() {
+  define void @f1(i1 %arg) {
   entry:
     %arrayidx = getelementptr inbounds [2 x i16], ptr undef, i16 0, i16 2
     br label %for.cond
@@ -30,10 +30,10 @@
     %5 = phi ptr [ %4, %for.body ], [ %5, %if.then5 ], [ undef, %for.cond2 ]
     %6 = phi ptr [ %3, %for.body ], [ %6, %if.then5 ], [ undef, %for.cond2 ]
     %7 = phi ptr [ %2, %for.body ], [ %6, %if.then5 ], [ undef, %for.cond2 ]
-    br i1 undef, label %for.cond2, label %if.then5
+    br i1 %arg, label %for.cond2, label %if.then5
 
   if.then5:
-    br i1 undef, label %cleanup, label %for.cond2
+    br i1 %arg, label %cleanup, label %for.cond2
 
   cleanup:
     br i1 true, label %for.cond, label %for.body
diff --git a/llvm/test/CodeGen/X86/codegen-prepare-replacephi2.mir b/llvm/test/CodeGen/X86/codegen-prepare-replacephi2.mir
index 6159aa8a..e93e04b 100644
--- a/llvm/test/CodeGen/X86/codegen-prepare-replacephi2.mir
+++ b/llvm/test/CodeGen/X86/codegen-prepare-replacephi2.mir
@@ -7,7 +7,7 @@
 
 --- |
 
-  define void @f1() {
+  define void @f1(i1 %arg) {
   entry:
     %arrayidx = getelementptr inbounds [2 x i16], ptr undef, i16 0, i16 2
     br label %for.cond
@@ -24,7 +24,7 @@
     %2 = phi ptr [ %1, %for.cond ], [ %12, %cleanup ]
     %3 = phi ptr [ %0, %for.cond ], [ %11, %cleanup ]
     %4 = phi ptr [ %0, %for.cond ], [ %10, %cleanup ]
-    br i1 undef, label %for.cond2.preheader, label %if.then
+    br i1 %arg, label %for.cond2.preheader, label %if.then
 
   for.cond2.preheader:
     br label %for.cond2
@@ -37,7 +37,7 @@
     %5 = phi ptr [ %8, %for.inc ], [ %4, %for.cond2.preheader ]
     %6 = phi ptr [ %9, %for.inc ], [ %3, %for.cond2.preheader ]
     %7 = phi ptr [ %9, %for.inc ], [ %2, %for.cond2.preheader ]
-    br i1 undef, label %for.inc, label %if.then5
+    br i1 %arg, label %for.inc, label %if.then5
 
   if.then5:
     br i1 true, label %cleanup.loopexit, label %if.end
diff --git a/llvm/test/CodeGen/X86/combine-concatvectors.ll b/llvm/test/CodeGen/X86/combine-concatvectors.ll
index 230afd1..7237b02 100644
--- a/llvm/test/CodeGen/X86/combine-concatvectors.ll
+++ b/llvm/test/CodeGen/X86/combine-concatvectors.ll
@@ -72,7 +72,7 @@ alloca_0:
   br label %loop.4942
 
 loop.4942:                                        ; preds = %loop.4942, %alloca_0
-  br i1 undef, label %loop.4942, label %ifmerge.1298
+  br i1 poison, label %loop.4942, label %ifmerge.1298
 
 ifmerge.1298:                                     ; preds = %loop.4942
   %gepload4638 = load float, ptr getelementptr inbounds ([49216 x i8], ptr @qa_, i64 0, i64 28324), align 4
diff --git a/llvm/test/CodeGen/X86/crash.ll b/llvm/test/CodeGen/X86/crash.ll
index 16e3bb6..2f49a60 100644
--- a/llvm/test/CodeGen/X86/crash.ll
+++ b/llvm/test/CodeGen/X86/crash.ll
@@ -115,9 +115,9 @@ do.body92:                                        ; preds = %if.then66
 ; Crash during XOR optimization.
 ; <rdar://problem/7869290>
 
-define void @test7() nounwind ssp {
+define void @test7(i1 %arg) nounwind ssp {
 entry:
-  br i1 undef, label %bb14, label %bb67
+  br i1 %arg, label %bb14, label %bb67
 
 bb14:
   %tmp0 = trunc i16 undef to i1
@@ -157,14 +157,14 @@ entry:
 ; shift of and.
 %struct.S0 = type { i8, [2 x i8], i8 }
 
-define void @func_59(i32 %p_63) noreturn nounwind {
+define void @func_59(i32 %p_63, i1 %arg) noreturn nounwind {
 entry:
   br label %for.body
 
 for.body:                                         ; preds = %for.inc44, %entry
   %p_63.addr.1 = phi i32 [ %p_63, %entry ], [ 0, %for.inc44 ]
   %l_74.0 = phi i32 [ 0, %entry ], [ %add46, %for.inc44 ]
-  br i1 undef, label %for.inc44, label %bb.nph81
+  br i1 %arg, label %for.inc44, label %bb.nph81
 
 bb.nph81:                                         ; preds = %for.body
   %tmp98 = add i32 %p_63.addr.1, 0
@@ -237,7 +237,7 @@ declare i64 @llvm.objectsize.i64.p0(ptr, i1) nounwind readnone
 %t20 = type { i32, i32 }
 %t21 = type { ptr }
 
-define void @_ZNK4llvm17MipsFrameLowering12emitPrologueERNS_15MachineFunctionE() ssp align 2 {
+define void @_ZNK4llvm17MipsFrameLowering12emitPrologueERNS_15MachineFunctionE(i1 %arg) ssp align 2 {
 bb:
   %tmp = load ptr, ptr undef, align 4
   %tmp3 = getelementptr inbounds %t9, ptr %tmp, i32 0, i32 0, i32 0, i32 0, i32 1
@@ -246,7 +246,7 @@ bb:
 bb4:                                              ; preds = %bb37, %bb
   %tmp5 = phi i96 [ undef, %bb ], [ %tmp38, %bb37 ]
   %tmp6 = phi i96 [ undef, %bb ], [ %tmp39, %bb37 ]
-  br i1 undef, label %bb34, label %bb7
+  br i1 %arg, label %bb34, label %bb7
 
 bb7:                                              ; preds = %bb4
   %tmp8 = load i32, ptr undef, align 4
@@ -292,7 +292,7 @@ bb33:                                             ; preds = %bb29
   unreachable
 
 bb34:                                             ; preds = %bb4
-  br i1 undef, label %bb36, label %bb35
+  br i1 %arg, label %bb36, label %bb35
 
 bb35:                                             ; preds = %bb34
   store ptr null, ptr %tmp3, align 4
@@ -319,7 +319,7 @@ declare void @llvm.lifetime.end.p0(i64, ptr nocapture) nounwind
 
 ; PR10463
 ; Spilling a virtual register with <undef> uses.
-define void @autogen_239_1000() {
+define void @autogen_239_1000(i1 %arg) {
 BB:
     %Shuff = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 undef, i32 undef>
     br label %CF
@@ -327,14 +327,14 @@ BB:
 CF:
     %B16 = frem <8 x double> zeroinitializer, %Shuff
     %E19 = extractelement <8 x double> %Shuff, i32 5
-    br i1 undef, label %CF, label %CF75
+    br i1 %arg, label %CF, label %CF75
 
 CF75:
-    br i1 undef, label %CF75, label %CF76
+    br i1 %arg, label %CF75, label %CF76
 
 CF76:
     store double %E19, ptr undef
-    br i1 undef, label %CF76, label %CF77
+    br i1 %arg, label %CF76, label %CF77
 
 CF77:
     %B55 = fmul <8 x double> %B16, undef
@@ -396,24 +396,24 @@ if.end:
 ; InstrEmitter::EmitSubregNode() may steal virtual registers from already
 ; emitted blocks when isCoalescableExtInstr points out the opportunity.
 ; Make sure kill flags are cleared on the newly global virtual register.
-define i64 @ov_read(ptr %vf, ptr nocapture %buffer, i32 %length, i32 %bigendianp, i32 %word, i32 %sgned, ptr %bitstream) nounwind uwtable ssp {
+define i64 @ov_read(ptr %vf, ptr nocapture %buffer, i32 %length, i32 %bigendianp, i32 %word, i32 %sgned, ptr %bitstream, i1 %arg) nounwind uwtable ssp {
 entry:
-  br i1 undef, label %return, label %while.body.preheader
+  br i1 %arg, label %return, label %while.body.preheader
 
 while.body.preheader:                             ; preds = %entry
-  br i1 undef, label %if.then3, label %if.end7
+  br i1 %arg, label %if.then3, label %if.end7
 
 if.then3:                                         ; preds = %while.body.preheader
   %0 = load i32, ptr undef, align 4
-  br i1 undef, label %land.lhs.true.i255, label %if.end7
+  br i1 %arg, label %land.lhs.true.i255, label %if.end7
 
 land.lhs.true.i255:                               ; preds = %if.then3
-  br i1 undef, label %if.then.i256, label %if.end7
+  br i1 %arg, label %if.then.i256, label %if.end7
 
 if.then.i256:                                     ; preds = %land.lhs.true.i255
   %sub.i = sub i32 0, %0
   %conv = sext i32 %sub.i to i64
-  br i1 undef, label %if.end7, label %while.end
+  br i1 %arg, label %if.end7, label %while.end
 
 if.end7:                                          ; preds = %if.then.i256, %land.lhs.true.i255, %if.then3, %while.body.preheader
   unreachable
@@ -486,12 +486,12 @@ declare void @fn3(...)
 ; When coalescing %1 and %2, the IMPLICIT_DEF instruction should be
 ; erased along with its value number.
 ;
-define void @rdar12474033() nounwind ssp {
+define void @rdar12474033(i1 %arg, i32 %arg2, i32 %arg3, i32 %arg4) nounwind ssp {
 bb:
-  br i1 undef, label %bb21, label %bb1
+  br i1 %arg, label %bb21, label %bb1
 
 bb1:                                              ; preds = %bb
-  switch i32 undef, label %bb10 [
+  switch i32 %arg2, label %bb10 [
     i32 4, label %bb2
     i32 1, label %bb9
     i32 5, label %bb3
@@ -503,7 +503,7 @@ bb2:                                              ; preds = %bb1
   unreachable
 
 bb3:                                              ; preds = %bb1, %bb1
-  br i1 undef, label %bb4, label %bb5
+  br i1 %arg, label %bb4, label %bb5
 
 bb4:                                              ; preds = %bb3
   unreachable
@@ -521,7 +521,7 @@ bb9:                                              ; preds = %bb1, %bb1
 bb10:                                             ; preds = %bb5, %bb1
   %tmp11 = phi i128 [ undef, %bb1 ], [ %tmp6, %bb5 ]
   %tmp12 = phi i128 [ 0, %bb1 ], [ %tmp8, %bb5 ]
-  switch i32 undef, label %bb21 [
+  switch i32 %arg3, label %bb21 [
     i32 2, label %bb18
     i32 3, label %bb13
     i32 5, label %bb16
@@ -530,7 +530,7 @@ bb10:                                             ; preds = %bb5, %bb1
   ]
 
 bb13:                                             ; preds = %bb10
-  br i1 undef, label %bb15, label %bb14
+  br i1 %arg, label %bb15, label %bb14
 
 bb14:                                             ; preds = %bb13
   br label %bb21
@@ -554,7 +554,7 @@ bb21:                                             ; preds = %bb18, %bb14, %bb10,
   %tmp23 = phi <4 x float> [ undef, %bb ], [ undef, %bb10 ], [ undef, %bb14 ], [ %tmp19, %bb18 ]
   store <4 x float> %tmp23, ptr undef, align 16
   store <4 x float> %tmp22, ptr undef, align 16
-  switch i32 undef, label %bb29 [
+  switch i32 %arg4, label %bb29 [
     i32 5, label %bb27
     i32 1, label %bb24
     i32 2, label %bb25
diff --git a/llvm/test/CodeGen/X86/domain-reassignment-test.ll b/llvm/test/CodeGen/X86/domain-reassignment-test.ll
index af7aca6..77c1ef2 100644
--- a/llvm/test/CodeGen/X86/domain-reassignment-test.ll
+++ b/llvm/test/CodeGen/X86/domain-reassignment-test.ll
@@ -3,7 +3,7 @@
 
 ; Check that the X86 domain reassignment pass doesn't introduce an illegal
 ; test instruction. See PR37396
-define void @japi1_foo2_34617() {
+define void @japi1_foo2_34617(i1 %arg) {
 pass2:
   br label %if5
 
@@ -27,7 +27,7 @@ if5:
   %tmp120 = and i1 %tmp118, %tmp119
   %tmp121 = zext i1 %tmp120 to i8
   %tmp122 = and i8 %b.055, %tmp121
-  br i1 undef, label %L174, label %if5
+  br i1 %arg, label %L174, label %if5
 
 L188:
   unreachable
diff --git a/llvm/test/CodeGen/X86/fast-isel-cmp-branch.ll b/llvm/test/CodeGen/X86/fast-isel-cmp-branch.ll
index 8d8d4fa..4a5cddb 100644
--- a/llvm/test/CodeGen/X86/fast-isel-cmp-branch.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-cmp-branch.ll
@@ -5,9 +5,9 @@
 ; The machine verifier will catch and complain about this case.
 ; CHECK-LABEL: baz
 ; CHECK: retq
-define void @baz() {
+define void @baz(i1 %arg) {
 entry:
-  br i1 undef, label %exit, label %exit
+  br i1 %arg, label %exit, label %exit
 
 exit:
   ret void
diff --git a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
index c6da0c5..1dcce53 100644
--- a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
+++ b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
@@ -3,6 +3,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefixes=AVX,AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX10_2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx        | FileCheck %s --check-prefixes=X86
 
 declare float @llvm.maximum.f32(float, float)
@@ -73,6 +74,11 @@ define float @test_fmaximum(float %x, float %y) nounwind {
 ; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxss $1, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -110,6 +116,11 @@ define <4 x float> @test_fmaximum_scalarize(<4 x float> %x, <4 x float> %y) "no-
 ; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_scalarize:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $1, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_scalarize:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
@@ -129,6 +140,11 @@ define float @test_fmaximum_nan0(float %x, float %y) {
 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_nan0:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_nan0:
 ; X86:       # %bb.0:
 ; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
@@ -148,6 +164,11 @@ define float @test_fmaximum_nan1(float %x, float %y) {
 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_nan1:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_nan1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
@@ -215,6 +236,13 @@ define float @test_fmaximum_nnan(float %x, float %y) nounwind {
 ; AVX512DQ-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_nnan:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vaddss %xmm1, %xmm0, %xmm2
+; AVX10_2-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    vminmaxss $1, %xmm0, %xmm2
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_nnan:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -272,6 +300,12 @@ define double @test_fmaximum_zero0(double %x, double %y) nounwind {
 ; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_zero0:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; AVX10_2-NEXT:    vminmaxsd $1, %xmm0, %xmm1
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_zero0:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
@@ -323,6 +357,12 @@ define double @test_fmaximum_zero1(double %x, double %y) nounwind {
 ; AVX512-NEXT:    vmovapd %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_zero1:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxsd $1, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_zero1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
@@ -354,6 +394,11 @@ define double @test_fmaximum_zero2(double %x, double %y) {
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_zero2:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_zero2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    fldz
@@ -390,6 +435,11 @@ define float @test_fmaximum_nsz(float %x, float %y) "no-signed-zeros-fp-math"="t
 ; AVX512-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_nsz:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxss $1, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_nsz:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -474,6 +524,12 @@ define float @test_fmaximum_combine_cmps(float %x, float %y) nounwind {
 ; AVX512DQ-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_combine_cmps:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxss $1, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_combine_cmps:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -562,6 +618,11 @@ define float @test_fminimum(float %x, float %y) nounwind {
 ; AVX512-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxss $0, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -599,6 +660,11 @@ define <2 x double> @test_fminimum_scalarize(<2 x double> %x, <2 x double> %y) "
 ; AVX-NEXT:    vminpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_scalarize:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_scalarize:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vminpd %xmm1, %xmm0, %xmm0
@@ -618,6 +684,11 @@ define float @test_fminimum_nan0(float %x, float %y) {
 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_nan0:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_nan0:
 ; X86:       # %bb.0:
 ; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
@@ -637,6 +708,11 @@ define float @test_fminimum_nan1(float %x, float %y) {
 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_nan1:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_nan1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
@@ -695,6 +771,11 @@ define double @test_fminimum_nnan(double %x, double %y) "no-nans-fp-math"="true"
 ; AVX512DQ-NEXT:    vminsd %xmm2, %xmm1, %xmm0
 ; AVX512DQ-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_nnan:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxsd $0, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_nnan:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
@@ -749,6 +830,11 @@ define double @test_fminimum_zero0(double %x, double %y) nounwind {
 ; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_zero0:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxsd $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_zero0:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
@@ -796,6 +882,11 @@ define double @test_fminimum_zero1(double %x, double %y) nounwind {
 ; AVX512-NEXT:    vmovapd %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_zero1:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxsd $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_zero1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
@@ -826,6 +917,11 @@ define double @test_fminimum_zero2(double %x, double %y) {
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_zero2:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_zero2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    fldz
@@ -863,6 +959,11 @@ define float @test_fminimum_nsz(float %x, float %y) nounwind {
 ; AVX512-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_nsz:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxss $0, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_nsz:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -948,6 +1049,12 @@ define float @test_fminimum_combine_cmps(float %x, float %y) nounwind {
 ; AVX512DQ-NEXT:    vminss %xmm2, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_combine_cmps:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxss $0, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_combine_cmps:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -1009,6 +1116,11 @@ define <2 x double> @test_fminimum_vector(<2 x double> %x, <2 x double> %y) {
 ; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
@@ -1032,6 +1144,11 @@ define <4 x float> @test_fmaximum_vector(<4 x float> %x, <4 x float> %y) "no-nan
 ; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_vector:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $1, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_vector:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
@@ -1054,6 +1171,12 @@ define <2 x double> @test_fminimum_vector_zero(<2 x double> %x) {
 ; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
@@ -1077,6 +1200,11 @@ define <4 x float> @test_fmaximum_vector_signed_zero(<4 x float> %x) {
 ; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_vector_signed_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_vector_signed_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
@@ -1102,6 +1230,13 @@ define <2 x double> @test_fminimum_vector_partially_zero(<2 x double> %x) {
 ; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector_partially_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector_partially_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
@@ -1149,6 +1284,13 @@ define <2 x double> @test_fminimum_vector_different_zeros(<2 x double> %x) {
 ; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector_different_zeros:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector_different_zeros:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
@@ -1177,6 +1319,11 @@ define <4 x float> @test_fmaximum_vector_non_zero(<4 x float> %x) {
 ; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_vector_non_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_vector_non_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0]
@@ -1206,6 +1353,13 @@ define <2 x double> @test_fminimum_vector_nan(<2 x double> %x) {
 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector_nan:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector_nan:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
@@ -1232,6 +1386,12 @@ define <2 x double> @test_fminimum_vector_zero_first(<2 x double> %x) {
 ; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector_zero_first:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector_zero_first:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
@@ -1260,6 +1420,11 @@ define <2 x double> @test_fminimum_vector_signed_zero(<2 x double> %x) {
 ; AVX-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector_signed_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxpd $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector_signed_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm1
@@ -1284,6 +1449,11 @@ define <4 x float> @test_fmaximum_vector_signed_zero_first(<4 x float> %x) {
 ; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_vector_signed_zero_first:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_vector_signed_zero_first:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
@@ -1314,6 +1484,12 @@ define <4 x float> @test_fmaximum_vector_zero(<4 x float> %x) {
 ; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_vector_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxps $1, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_vector_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
@@ -1369,6 +1545,12 @@ define <4 x float> @test_fmaximum_v4f32_splat(<4 x float> %x, float %y) {
 ; AVX512-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_v4f32_splat:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vbroadcastss %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxps $1, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_v4f32_splat:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm1
@@ -1803,6 +1985,11 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
 ; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_v4f16:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxph $1, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_v4f16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $164, %esp
@@ -2330,6 +2517,11 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; AVX512-NEXT:    .cfi_def_cfa_offset 8
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_v4bf16:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxnepbf16 $1, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_v4bf16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
new file mode 100644
index 0000000..2e9e8e6
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -0,0 +1,2765 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2     | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX10_2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx        | FileCheck %s --check-prefixes=X86
+
+declare float @llvm.maximumnum.f32(float, float)
+declare double @llvm.maximumnum.f64(double, double)
+declare float @llvm.minimumnum.f32(float, float)
+declare double @llvm.minimumnum.f64(double, double)
+declare <2 x double> @llvm.minimumnum.v2f64(<2 x double>, <2 x double>)
+declare <4 x float> @llvm.maximumnum.v4f32(<4 x float>, <4 x float>)
+declare <4 x half> @llvm.maximumnum.v4f16(<4 x half>, <4 x half>)
+declare <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
+
+;
+; fmaximumnum
+;
+
+define float @test_fmaximumnum(float %x, float %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    js .LBB0_2
+; SSE2-NEXT:  # %bb.1:
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:  .LBB0_2:
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    cmpordss %xmm3, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm4
+; SSE2-NEXT:    andps %xmm3, %xmm4
+; SSE2-NEXT:    js .LBB0_4
+; SSE2-NEXT:  # %bb.3:
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:  .LBB0_4:
+; SSE2-NEXT:    maxss %xmm1, %xmm3
+; SSE2-NEXT:    andnps %xmm3, %xmm0
+; SSE2-NEXT:    orps %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fmaximumnum:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    js .LBB0_1
+; AVX1-NEXT:  # %bb.2:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX1-NEXT:    jmp .LBB0_3
+; AVX1-NEXT:  .LBB0_1:
+; AVX1-NEXT:    vmovdqa %xmm1, %xmm2
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX1-NEXT:  .LBB0_3:
+; AVX1-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vcmpordss %xmm1, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fmaximumnum:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    testl %eax, %eax
+; AVX512-NEXT:    sets %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxss $17, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovd %xmm2, %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    js .LBB0_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    vmovdqa %xmm2, %xmm1
+; X86-NEXT:    jmp .LBB0_3
+; X86-NEXT:  .LBB0_1:
+; X86-NEXT:    vmovdqa %xmm0, %xmm1
+; X86-NEXT:    vmovdqa %xmm2, %xmm0
+; X86-NEXT:  .LBB0_3:
+; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+  %1 = tail call float @llvm.maximumnum.f32(float %x, float %y)
+  ret float %1
+}
+
+define <4 x float> @test_fmaximumnum_scalarize(<4 x float> %x, <4 x float> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
+; SSE2-LABEL: test_fmaximumnum_scalarize:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    maxps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fmaximumnum_scalarize:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_scalarize:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $17, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_scalarize:
+; X86:       # %bb.0:
+; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
+; X86-NEXT:    retl
+  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> %y)
+  ret <4 x float> %r
+}
+
+define float @test_fmaximumnum_nan0(float %x, float %y) {
+; SSE2-LABEL: test_fmaximumnum_nan0:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fmaximumnum_nan0:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_nan0:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovaps %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_nan0:
+; X86:       # %bb.0:
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    retl
+  %1 = tail call float @llvm.maximumnum.f32(float 0x7fff000000000000, float %y)
+  ret float %1
+}
+
+define float @test_fmaximumnum_nan1(float %x, float %y) {
+; SSE2-LABEL: test_fmaximumnum_nan1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fmaximumnum_nan1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_nan1:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_nan1:
+; X86:       # %bb.0:
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    retl
+  %1 = tail call float @llvm.maximumnum.f32(float %x, float 0x7fff000000000000)
+  ret float %1
+}
+
+define float @test_fmaximumnum_nnan(float %x, float %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum_nnan:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    addss %xmm1, %xmm2
+; SSE2-NEXT:    subss %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    js .LBB4_1
+; SSE2-NEXT:  # %bb.2:
+; SSE2-NEXT:    maxss %xmm2, %xmm0
+; SSE2-NEXT:    retq
+; SSE2-NEXT:  .LBB4_1:
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    maxss %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fmaximumnum_nnan:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm2, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    js .LBB4_1
+; AVX1-NEXT:  # %bb.2:
+; AVX1-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+; AVX1-NEXT:  .LBB4_1:
+; AVX1-NEXT:    vmovaps %xmm0, %xmm1
+; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512F-LABEL: test_fmaximumnum_nnan:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vaddss %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vmovd %xmm2, %eax
+; AVX512F-NEXT:    testl %eax, %eax
+; AVX512F-NEXT:    sets %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovaps %xmm2, %xmm1
+; AVX512F-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512F-NEXT:    vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512F-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_fmaximumnum_nnan:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vaddss %xmm1, %xmm0, %xmm2
+; AVX512DQ-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vfpclassss $3, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isPositiveZero(xmm0)
+; AVX512DQ-NEXT:    kmovw %k0, %k1
+; AVX512DQ-NEXT:    vmovaps %xmm2, %xmm1
+; AVX512DQ-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512DQ-NEXT:    vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512DQ-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_nnan:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vaddss %xmm1, %xmm0, %xmm2
+; AVX10_2-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    vminmaxss $17, %xmm0, %xmm2
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_nnan:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT:    vaddss %xmm0, %xmm2, %xmm1
+; X86-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; X86-NEXT:    vmovd %xmm1, %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    js .LBB4_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    vmovaps %xmm1, %xmm2
+; X86-NEXT:    jmp .LBB4_3
+; X86-NEXT:  .LBB4_1:
+; X86-NEXT:    vmovaps %xmm0, %xmm2
+; X86-NEXT:    vmovaps %xmm1, %xmm0
+; X86-NEXT:  .LBB4_3:
+; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+  %1 = fadd nnan float %x, %y
+  %2 = fsub nnan float %x, %y
+  %3 = tail call float @llvm.maximumnum.f32(float %1, float %2)
+  ret float %3
+}
+
+define double @test_fmaximumnum_zero0(double %x, double %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum_zero0:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    cmpordsd %xmm1, %xmm0
+; SSE2-NEXT:    movapd %xmm0, %xmm2
+; SSE2-NEXT:    andpd %xmm1, %xmm2
+; SSE2-NEXT:    xorpd %xmm3, %xmm3
+; SSE2-NEXT:    maxsd %xmm3, %xmm1
+; SSE2-NEXT:    andnpd %xmm1, %xmm0
+; SSE2-NEXT:    orpd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fmaximumnum_zero0:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vcmpordsd %xmm1, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fmaximumnum_zero0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vcmpordsd %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_zero0:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; AVX10_2-NEXT:    vminmaxsd $17, %xmm0, %xmm1
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_zero0:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpordsd %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovlpd %xmm0, (%esp)
+; X86-NEXT:    fldl (%esp)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %1 = tail call double @llvm.maximumnum.f64(double 0.0, double %y)
+  ret double %1
+}
+
+define double @test_fmaximumnum_zero1(double %x, double %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum_zero1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movapd %xmm0, %xmm1
+; SSE2-NEXT:    cmpordsd %xmm0, %xmm1
+; SSE2-NEXT:    movapd %xmm1, %xmm2
+; SSE2-NEXT:    andpd %xmm0, %xmm2
+; SSE2-NEXT:    xorpd %xmm3, %xmm3
+; SSE2-NEXT:    maxsd %xmm3, %xmm0
+; SSE2-NEXT:    andnpd %xmm0, %xmm1
+; SSE2-NEXT:    orpd %xmm2, %xmm1
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fmaximumnum_zero1:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpordsd %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fmaximumnum_zero1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vcmpordsd %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmovapd %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_zero1:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxsd $17, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_zero1:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpordsd %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovlpd %xmm0, (%esp)
+; X86-NEXT:    fldl (%esp)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %1 = tail call double @llvm.maximumnum.f64(double %x, double 0.0)
+  ret double %1
+}
+
+define double @test_fmaximumnum_zero2(double %x, double %y) {
+; SSE2-LABEL: test_fmaximumnum_zero2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fmaximumnum_zero2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_zero2:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_zero2:
+; X86:       # %bb.0:
+; X86-NEXT:    fldz
+; X86-NEXT:    retl
+  %1 = tail call double @llvm.maximumnum.f64(double 0.0, double -0.0)
+  ret double %1
+}
+
+define float @test_fmaximumnum_nsz(float %x, float %y) "no-signed-zeros-fp-math"="true" nounwind {
+; SSE2-LABEL: test_fmaximumnum_nsz:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    cmpordss %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm3
+; SSE2-NEXT:    andps %xmm0, %xmm3
+; SSE2-NEXT:    maxss %xmm1, %xmm0
+; SSE2-NEXT:    andnps %xmm0, %xmm2
+; SSE2-NEXT:    orps %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fmaximumnum_nsz:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fmaximumnum_nsz:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vcmpordss %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmovaps %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_nsz:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxss $17, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_nsz:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm1
+; X86-NEXT:    vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+  %1 = tail call float @llvm.maximumnum.f32(float %x, float %y)
+  ret float %1
+}
+
+define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum_combine_cmps:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    divss %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    js .LBB9_2
+; SSE2-NEXT:  # %bb.1:
+; SSE2-NEXT:    movaps %xmm1, %xmm3
+; SSE2-NEXT:  .LBB9_2:
+; SSE2-NEXT:    movaps %xmm3, %xmm2
+; SSE2-NEXT:    cmpordss %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm4
+; SSE2-NEXT:    andps %xmm3, %xmm4
+; SSE2-NEXT:    js .LBB9_4
+; SSE2-NEXT:  # %bb.3:
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:  .LBB9_4:
+; SSE2-NEXT:    maxss %xmm1, %xmm3
+; SSE2-NEXT:    andnps %xmm3, %xmm2
+; SSE2-NEXT:    orps %xmm4, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fmaximumnum_combine_cmps:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    js .LBB9_1
+; AVX1-NEXT:  # %bb.2:
+; AVX1-NEXT:    vmovaps %xmm0, %xmm2
+; AVX1-NEXT:    jmp .LBB9_3
+; AVX1-NEXT:  .LBB9_1:
+; AVX1-NEXT:    vmovaps %xmm1, %xmm2
+; AVX1-NEXT:    vmovaps %xmm0, %xmm1
+; AVX1-NEXT:  .LBB9_3:
+; AVX1-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vcmpordss %xmm1, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512F-LABEL: test_fmaximumnum_combine_cmps:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    testl %eax, %eax
+; AVX512F-NEXT:    sets %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovaps %xmm0, %xmm2
+; AVX512F-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512F-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512F-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT:    vcmpordss %xmm1, %xmm1, %k1
+; AVX512F-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512F-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_fmaximumnum_combine_cmps:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vfpclassss $3, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isPositiveZero(xmm0)
+; AVX512DQ-NEXT:    kmovw %k0, %k1
+; AVX512DQ-NEXT:    vmovaps %xmm1, %xmm2
+; AVX512DQ-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512DQ-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512DQ-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_combine_cmps:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxss $17, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_combine_cmps:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vmovd %xmm1, %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    js .LBB9_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    vmovaps %xmm1, %xmm2
+; X86-NEXT:    jmp .LBB9_3
+; X86-NEXT:  .LBB9_1:
+; X86-NEXT:    vmovaps %xmm0, %xmm2
+; X86-NEXT:    vmovaps %xmm1, %xmm0
+; X86-NEXT:  .LBB9_3:
+; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+  %1 = fdiv nnan float %y, %x
+  %2 = tail call float @llvm.maximumnum.f32(float %x, float %1)
+  ret float %2
+}
+
+;
+; fminimumnum
+;
+
+define float @test_fminimumnum(float %x, float %y) nounwind {
+; SSE2-LABEL: test_fminimumnum:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    js .LBB10_2
+; SSE2-NEXT:  # %bb.1:
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:  .LBB10_2:
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    cmpordss %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm4
+; SSE2-NEXT:    andps %xmm3, %xmm4
+; SSE2-NEXT:    js .LBB10_4
+; SSE2-NEXT:  # %bb.3:
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:  .LBB10_4:
+; SSE2-NEXT:    minss %xmm0, %xmm3
+; SSE2-NEXT:    andnps %xmm3, %xmm2
+; SSE2-NEXT:    orps %xmm4, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fminimumnum:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    js .LBB10_1
+; AVX1-NEXT:  # %bb.2:
+; AVX1-NEXT:    vmovdqa %xmm1, %xmm2
+; AVX1-NEXT:    jmp .LBB10_3
+; AVX1-NEXT:  .LBB10_1:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX1-NEXT:  .LBB10_3:
+; AVX1-NEXT:    vminss %xmm2, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fminimumnum:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    testl %eax, %eax
+; AVX512-NEXT:    sets %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovaps %xmm1, %xmm2
+; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vminss %xmm2, %xmm0, %xmm1
+; AVX512-NEXT:    vcmpordss %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmovaps %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxss $16, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovd %xmm0, %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    js .LBB10_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    vmovdqa %xmm1, %xmm2
+; X86-NEXT:    jmp .LBB10_3
+; X86-NEXT:  .LBB10_1:
+; X86-NEXT:    vmovdqa %xmm0, %xmm2
+; X86-NEXT:    vmovdqa %xmm1, %xmm0
+; X86-NEXT:  .LBB10_3:
+; X86-NEXT:    vminss %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+  %1 = tail call float @llvm.minimumnum.f32(float %x, float %y)
+  ret float %1
+}
+
+define <2 x double> @test_fminimumnum_scalarize(<2 x double> %x, <2 x double> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
+; SSE2-LABEL: test_fminimumnum_scalarize:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    minpd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_scalarize:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vminpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_scalarize:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_scalarize:
+; X86:       # %bb.0:
+; X86-NEXT:    vminpd %xmm1, %xmm0, %xmm0
+; X86-NEXT:    retl
+  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> %y)
+  ret <2 x double> %r
+}
+
+define float @test_fminimumnum_nan0(float %x, float %y) {
+; SSE2-LABEL: test_fminimumnum_nan0:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_nan0:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_nan0:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovaps %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_nan0:
+; X86:       # %bb.0:
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    retl
+  %1 = tail call float @llvm.minimumnum.f32(float 0x7fff000000000000, float %y)
+  ret float %1
+}
+
+define float @test_fminimumnum_nan1(float %x, float %y) {
+; SSE2-LABEL: test_fminimumnum_nan1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_nan1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_nan1:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_nan1:
+; X86:       # %bb.0:
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    retl
+  %1 = tail call float @llvm.minimumnum.f32(float %x, float 0x7fff000000000000)
+  ret float %1
+}
+
+define double @test_fminimumnum_nnan(double %x, double %y) "no-nans-fp-math"="true" nounwind {
+; SSE2-LABEL: test_fminimumnum_nnan:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB14_1
+; SSE2-NEXT:  # %bb.2:
+; SSE2-NEXT:    minsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+; SSE2-NEXT:  .LBB14_1:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    minsd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fminimumnum_nnan:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    testq %rax, %rax
+; AVX1-NEXT:    js .LBB14_1
+; AVX1-NEXT:  # %bb.2:
+; AVX1-NEXT:    vminsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+; AVX1-NEXT:  .LBB14_1:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX1-NEXT:    vminsd %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512F-LABEL: test_fminimumnum_nnan:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    testq %rax, %rax
+; AVX512F-NEXT:    sets %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovapd %xmm1, %xmm2
+; AVX512F-NEXT:    vmovsd %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512F-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512F-NEXT:    vminsd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_fminimumnum_nnan:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vfpclasssd $5, %xmm1, %k0 # k0 = isQuietNaN(xmm1) | isNegativeZero(xmm1)
+; AVX512DQ-NEXT:    kmovw %k0, %k1
+; AVX512DQ-NEXT:    vmovapd %xmm0, %xmm2
+; AVX512DQ-NEXT:    vmovsd %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512DQ-NEXT:    vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512DQ-NEXT:    vminsd %xmm2, %xmm1, %xmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_nnan:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxsd $16, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_nnan:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    vextractps $1, %xmm0, %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    js .LBB14_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    vmovapd %xmm1, %xmm2
+; X86-NEXT:    jmp .LBB14_3
+; X86-NEXT:  .LBB14_1:
+; X86-NEXT:    vmovapd %xmm0, %xmm2
+; X86-NEXT:    vmovapd %xmm1, %xmm0
+; X86-NEXT:  .LBB14_3:
+; X86-NEXT:    vminsd %xmm2, %xmm0, %xmm0
+; X86-NEXT:    vmovsd %xmm0, (%esp)
+; X86-NEXT:    fldl (%esp)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %1 = tail call double @llvm.minimumnum.f64(double %x, double %y)
+  ret double %1
+}
+
+define double @test_fminimumnum_zero0(double %x, double %y) nounwind {
+; SSE2-LABEL: test_fminimumnum_zero0:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    cmpordsd %xmm1, %xmm0
+; SSE2-NEXT:    movapd %xmm0, %xmm2
+; SSE2-NEXT:    andpd %xmm1, %xmm2
+; SSE2-NEXT:    minsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    andnpd %xmm1, %xmm0
+; SSE2-NEXT:    orpd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fminimumnum_zero0:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vcmpordsd %xmm1, %xmm1, %xmm0
+; AVX1-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fminimumnum_zero0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcmpordsd %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_zero0:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxsd $16, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_zero0:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    vcmpordsd %xmm0, %xmm0, %xmm1
+; X86-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
+; X86-NEXT:    vmovlpd %xmm0, (%esp)
+; X86-NEXT:    fldl (%esp)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %1 = tail call double @llvm.minimumnum.f64(double -0.0, double %y)
+  ret double %1
+}
+
+define double @test_fminimumnum_zero1(double %x, double %y) nounwind {
+; SSE2-LABEL: test_fminimumnum_zero1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movapd %xmm0, %xmm1
+; SSE2-NEXT:    cmpordsd %xmm0, %xmm1
+; SSE2-NEXT:    movapd %xmm1, %xmm2
+; SSE2-NEXT:    andpd %xmm0, %xmm2
+; SSE2-NEXT:    minsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    andnpd %xmm0, %xmm1
+; SSE2-NEXT:    orpd %xmm2, %xmm1
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fminimumnum_zero1:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vcmpordsd %xmm0, %xmm0, %xmm1
+; AVX1-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fminimumnum_zero1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcmpordsd %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmovapd %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_zero1:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxsd $16, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_zero1:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    vcmpordsd %xmm0, %xmm0, %xmm1
+; X86-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
+; X86-NEXT:    vmovlpd %xmm0, (%esp)
+; X86-NEXT:    fldl (%esp)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %1 = tail call double @llvm.minimumnum.f64(double %x, double -0.0)
+  ret double %1
+}
+
+define double @test_fminimumnum_zero2(double %x, double %y) {
+; SSE2-LABEL: test_fminimumnum_zero2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_zero2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_zero2:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_zero2:
+; X86:       # %bb.0:
+; X86-NEXT:    fldz
+; X86-NEXT:    fchs
+; X86-NEXT:    retl
+  %1 = tail call double @llvm.minimumnum.f64(double -0.0, double 0.0)
+  ret double %1
+}
+
+define float @test_fminimumnum_nsz(float %x, float %y) nounwind {
+; SSE2-LABEL: test_fminimumnum_nsz:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    cmpordss %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm3
+; SSE2-NEXT:    andps %xmm0, %xmm3
+; SSE2-NEXT:    minss %xmm1, %xmm0
+; SSE2-NEXT:    andnps %xmm0, %xmm2
+; SSE2-NEXT:    orps %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fminimumnum_nsz:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fminimumnum_nsz:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vcmpordss %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmovaps %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_nsz:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxss $16, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_nsz:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm1
+; X86-NEXT:    vminss {{[0-9]+}}(%esp), %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+  %1 = tail call nsz float @llvm.minimumnum.f32(float %x, float %y)
+  ret float %1
+}
+
+define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind {
+; SSE2-LABEL: test_fminimumnum_combine_cmps:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    divss %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movaps %xmm1, %xmm3
+; SSE2-NEXT:    js .LBB19_2
+; SSE2-NEXT:  # %bb.1:
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:  .LBB19_2:
+; SSE2-NEXT:    movaps %xmm3, %xmm2
+; SSE2-NEXT:    cmpordss %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm4
+; SSE2-NEXT:    andps %xmm3, %xmm4
+; SSE2-NEXT:    js .LBB19_4
+; SSE2-NEXT:  # %bb.3:
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:  .LBB19_4:
+; SSE2-NEXT:    minss %xmm0, %xmm3
+; SSE2-NEXT:    andnps %xmm3, %xmm2
+; SSE2-NEXT:    orps %xmm4, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fminimumnum_combine_cmps:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vdivss %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    js .LBB19_1
+; AVX1-NEXT:  # %bb.2:
+; AVX1-NEXT:    vmovaps %xmm2, %xmm1
+; AVX1-NEXT:    jmp .LBB19_3
+; AVX1-NEXT:  .LBB19_1:
+; AVX1-NEXT:    vmovaps %xmm0, %xmm1
+; AVX1-NEXT:    vmovaps %xmm2, %xmm0
+; AVX1-NEXT:  .LBB19_3:
+; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512F-LABEL: test_fminimumnum_combine_cmps:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    testl %eax, %eax
+; AVX512F-NEXT:    sets %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovaps %xmm1, %xmm2
+; AVX512F-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512F-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512F-NEXT:    vminss %xmm2, %xmm0, %xmm1
+; AVX512F-NEXT:    vcmpordss %xmm0, %xmm0, %k1
+; AVX512F-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512F-NEXT:    vmovaps %xmm1, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_fminimumnum_combine_cmps:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vfpclassss $5, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isNegativeZero(xmm0)
+; AVX512DQ-NEXT:    kmovw %k0, %k1
+; AVX512DQ-NEXT:    vmovaps %xmm1, %xmm2
+; AVX512DQ-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512DQ-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512DQ-NEXT:    vminss %xmm2, %xmm0, %xmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_combine_cmps:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxss $16, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_combine_cmps:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    vdivss %xmm0, %xmm1, %xmm2
+; X86-NEXT:    vmovd %xmm0, %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    js .LBB19_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    vmovaps %xmm2, %xmm1
+; X86-NEXT:    jmp .LBB19_3
+; X86-NEXT:  .LBB19_1:
+; X86-NEXT:    vmovaps %xmm0, %xmm1
+; X86-NEXT:    vmovaps %xmm2, %xmm0
+; X86-NEXT:  .LBB19_3:
+; X86-NEXT:    vminss %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+  %1 = fdiv nnan float %y, %x
+  %2 = tail call float @llvm.minimumnum.f32(float %x, float %1)
+  ret float %2
+}
+
+define <2 x double> @test_fminimumnum_vector(<2 x double> %x, <2 x double> %y) {
+; SSE2-LABEL: test_fminimumnum_vector:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm4
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    minpd %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    cmpordpd %xmm3, %xmm0
+; SSE2-NEXT:    andpd %xmm0, %xmm3
+; SSE2-NEXT:    andnpd %xmm1, %xmm0
+; SSE2-NEXT:    orpd %xmm3, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_vector:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vminpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_vector:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_vector:
+; X86:       # %bb.0:
+; X86-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
+; X86-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vminpd %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> %y)
+  ret <2 x double> %r
+}
+
+define <4 x float> @test_fmaximumnum_vector(<4 x float> %x, <4 x float> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
+; SSE2-LABEL: test_fmaximumnum_vector:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    maxps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fmaximumnum_vector:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_vector:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $17, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_vector:
+; X86:       # %bb.0:
+; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
+; X86-NEXT:    retl
+  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> %y)
+  ret <4 x float> %r
+}
+
+define <2 x double> @test_fminimumnum_vector_zero(<2 x double> %x) {
+; SSE2-LABEL: test_fminimumnum_vector_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorpd %xmm1, %xmm1
+; SSE2-NEXT:    minpd %xmm0, %xmm1
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_vector_zero:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_vector_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_vector_zero:
+; X86:       # %bb.0:
+; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 0.>)
+  ret <2 x double> %r
+}
+
+define <4 x float> @test_fmaximumnum_vector_signed_zero(<4 x float> %x) {
+; SSE2-LABEL: test_fmaximumnum_vector_signed_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE2-NEXT:    maxps %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fmaximumnum_vector_signed_zero:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_vector_signed_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $17, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_vector_signed_zero:
+; X86:       # %bb.0:
+; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X86-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> <float -0., float -0., float -0., float -0.>)
+  ret <4 x float> %r
+}
+
+define <2 x double> @test_fminimumnum_vector_partially_zero(<2 x double> %x) {
+; SSE2-LABEL: test_fminimumnum_vector_partially_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorpd %xmm1, %xmm1
+; SSE2-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT:    minpd %xmm0, %xmm1
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_vector_partially_zero:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_vector_partially_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_vector_partially_zero:
+; X86:       # %bb.0:
+; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 5.>)
+  ret <2 x double> %r
+}
+
+define <2 x double> @test_fminimumnum_vector_different_zeros(<2 x double> %x) {
+; SSE2-LABEL: test_fminimumnum_vector_different_zeros:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm2, %xmm1
+; SSE2-NEXT:    movaps %xmm0, %xmm4
+; SSE2-NEXT:    andps %xmm3, %xmm4
+; SSE2-NEXT:    orps %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    minpd %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    cmpordpd %xmm3, %xmm0
+; SSE2-NEXT:    andpd %xmm0, %xmm3
+; SSE2-NEXT:    andnpd %xmm1, %xmm0
+; SSE2-NEXT:    orpd %xmm3, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_vector_different_zeros:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vminpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_vector_different_zeros:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_vector_different_zeros:
+; X86:       # %bb.0:
+; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; X86-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
+; X86-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vminpd %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double -0.>)
+  ret <2 x double> %r
+}
+
+define <4 x float> @test_fmaximumnum_vector_non_zero(<4 x float> %x) {
+; SSE2-LABEL: test_fmaximumnum_vector_non_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0]
+; SSE2-NEXT:    maxps %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fmaximumnum_vector_non_zero:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0]
+; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_vector_non_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $17, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_vector_non_zero:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0]
+; X86-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> <float 5., float 4., float 3., float 2.>)
+  ret <4 x float> %r
+}
+
+define <2 x double> @test_fminimumnum_vector_nan(<2 x double> %x) {
+; SSE2-LABEL: test_fminimumnum_vector_nan:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorpd %xmm2, %xmm2
+; SSE2-NEXT:    xorpd %xmm1, %xmm1
+; SSE2-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT:    minpd %xmm0, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_vector_nan:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vmovhpd {{.*#+}} xmm2 = xmm1[0],mem[0]
+; AVX-NEXT:    vminpd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_vector_nan:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_vector_nan:
+; X86:       # %bb.0:
+; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vcmpordpd %xmm1, %xmm1, %xmm2
+; X86-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; X86-NEXT:    retl
+  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 0x7fff000000000000>)
+  ret <2 x double> %r
+}
+
+define <2 x double> @test_fminimumnum_vector_zero_first(<2 x double> %x) {
+; SSE2-LABEL: test_fminimumnum_vector_zero_first:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorpd %xmm1, %xmm1
+; SSE2-NEXT:    minpd %xmm0, %xmm1
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_vector_zero_first:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_vector_zero_first:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_vector_zero_first:
+; X86:       # %bb.0:
+; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> <double 0., double 0.>, <2 x double> %x)
+  ret <2 x double> %r
+}
+
+define <2 x double> @test_fminimumnum_vector_signed_zero(<2 x double> %x) {
+; SSE2-LABEL: test_fminimumnum_vector_signed_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movapd %xmm0, %xmm1
+; SSE2-NEXT:    minpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    movapd %xmm0, %xmm2
+; SSE2-NEXT:    cmpordpd %xmm0, %xmm2
+; SSE2-NEXT:    andpd %xmm2, %xmm0
+; SSE2-NEXT:    andnpd %xmm1, %xmm2
+; SSE2-NEXT:    orpd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_vector_signed_zero:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm1
+; AVX-NEXT:    vminpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_vector_signed_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxpd $16, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_vector_signed_zero:
+; X86:       # %bb.0:
+; X86-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm1
+; X86-NEXT:    vminpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
+; X86-NEXT:    retl
+  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double -0., double -0.>)
+  ret <2 x double> %r
+}
+
+define <4 x float> @test_fmaximumnum_vector_signed_zero_first(<4 x float> %x) {
+; SSE2-LABEL: test_fmaximumnum_vector_signed_zero_first:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE2-NEXT:    maxps %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fmaximumnum_vector_signed_zero_first:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_vector_signed_zero_first:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $17, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_vector_signed_zero_first:
+; X86:       # %bb.0:
+; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X86-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> <float -0., float -0., float -0., float -0.>, <4 x float> %x)
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_fmaximumnum_vector_zero(<4 x float> %x) {
+; SSE2-LABEL: test_fmaximumnum_vector_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    maxps %xmm1, %xmm2
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    cmpordps %xmm0, %xmm1
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    andnps %xmm2, %xmm1
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fmaximumnum_vector_zero:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_vector_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxps $17, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_vector_zero:
+; X86:       # %bb.0:
+; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> <float 0., float 0., float 0., float 0.>)
+  ret <4 x float> %r
+}
+
+; PR77805: Check that signed zeroes are handled correctly in this case (FIXME)
+define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) {
+; SSE2-LABEL: test_fmaximumnum_v4f32_splat:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    movaps %xmm1, %xmm4
+; SSE2-NEXT:    andps %xmm2, %xmm4
+; SSE2-NEXT:    orps %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    andnps %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    maxps %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    cmpordps %xmm0, %xmm2
+; SSE2-NEXT:    andps %xmm2, %xmm0
+; SSE2-NEXT:    andnps %xmm1, %xmm2
+; SSE2-NEXT:    orps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fmaximumnum_v4f32_splat:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm0, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vmaxps %xmm2, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fmaximumnum_v4f32_splat:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vbroadcastss %xmm1, %xmm1
+; AVX512-NEXT:    vblendvps %xmm0, %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vblendvps %xmm0, %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vmaxps %xmm2, %xmm0, %xmm1
+; AVX512-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
+; AVX512-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_v4f32_splat:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vbroadcastss %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxps $17, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_v4f32_splat:
+; X86:       # %bb.0:
+; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm1
+; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm0, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmaxps %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+  %splatinsert = insertelement <4 x float> poison, float %y, i64 0
+  %vec = shufflevector <4 x float> %splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
+  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> %vec) readnone
+  ret <4 x float> %r
+}
+
+define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum_v4f16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    subq $104, %rsp
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    js .LBB33_2
+; SSE2-NEXT:  # %bb.1:
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NEXT:  .LBB33_2:
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    cmpordss %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    andps %xmm2, %xmm3
+; SSE2-NEXT:    js .LBB33_4
+; SSE2-NEXT:  # %bb.3:
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:  .LBB33_4:
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    maxss %xmm4, %xmm2
+; SSE2-NEXT:    andnps %xmm2, %xmm0
+; SSE2-NEXT:    orps %xmm3, %xmm0
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    js .LBB33_6
+; SSE2-NEXT:  # %bb.5:
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NEXT:  .LBB33_6:
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    cmpordss %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    andps %xmm2, %xmm3
+; SSE2-NEXT:    js .LBB33_8
+; SSE2-NEXT:  # %bb.7:
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:  .LBB33_8:
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    psrlq $48, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    psrlq $48, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    maxss %xmm4, %xmm2
+; SSE2-NEXT:    andnps %xmm2, %xmm0
+; SSE2-NEXT:    orps %xmm3, %xmm0
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    js .LBB33_10
+; SSE2-NEXT:  # %bb.9:
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NEXT:  .LBB33_10:
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    cmpordss %xmm2, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm3
+; SSE2-NEXT:    andps %xmm2, %xmm3
+; SSE2-NEXT:    js .LBB33_12
+; SSE2-NEXT:  # %bb.11:
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:  .LBB33_12:
+; SSE2-NEXT:    maxss %xmm4, %xmm2
+; SSE2-NEXT:    andnps %xmm2, %xmm1
+; SSE2-NEXT:    orps %xmm3, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movd (%rsp), %xmm4 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    js .LBB33_14
+; SSE2-NEXT:  # %bb.13:
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NEXT:  .LBB33_14:
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    cmpordss %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    andps %xmm2, %xmm3
+; SSE2-NEXT:    js .LBB33_16
+; SSE2-NEXT:  # %bb.15:
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:  .LBB33_16:
+; SSE2-NEXT:    maxss %xmm4, %xmm2
+; SSE2-NEXT:    andnps %xmm2, %xmm0
+; SSE2-NEXT:    orps %xmm3, %xmm0
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    addq $104, %rsp
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fmaximumnum_v4f16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    subq $120, %rsp
+; AVX1-NEXT:    vmovaps %xmm0, %xmm2
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm0
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    js .LBB33_1
+; AVX1-NEXT:  # %bb.2:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT:    jmp .LBB33_3
+; AVX1-NEXT:  .LBB33_1:
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX1-NEXT:  .LBB33_3:
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vcmpordss %xmm2, %xmm2, %xmm1
+; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    callq __truncsfhf2@PLT
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    js .LBB33_4
+; AVX1-NEXT:  # %bb.5:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX1-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
+; AVX1-NEXT:    jmp .LBB33_6
+; AVX1-NEXT:  .LBB33_4:
+; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX1-NEXT:  .LBB33_6:
+; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vcmpordss %xmm2, %xmm2, %xmm1
+; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    callq __truncsfhf2@PLT
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    js .LBB33_7
+; AVX1-NEXT:  # %bb.8:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX1-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
+; AVX1-NEXT:    jmp .LBB33_9
+; AVX1-NEXT:  .LBB33_7:
+; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX1-NEXT:  .LBB33_9:
+; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vcmpordss %xmm2, %xmm2, %xmm1
+; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    callq __truncsfhf2@PLT
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    js .LBB33_10
+; AVX1-NEXT:  # %bb.11:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT:    jmp .LBB33_12
+; AVX1-NEXT:  .LBB33_10:
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX1-NEXT:  .LBB33_12:
+; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vcmpordss %xmm2, %xmm2, %xmm1
+; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    callq __truncsfhf2@PLT
+; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
+; AVX1-NEXT:    addq $120, %rsp
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fmaximumnum_v4f16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    subq $88, %rsp
+; AVX512-NEXT:    vmovdqa %xmm1, %xmm4
+; AVX512-NEXT:    vmovdqa %xmm0, %xmm6
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm0
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm6[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT:    vucomiss %xmm2, %xmm2
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k2
+; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k2}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
+; AVX512-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm2
+; AVX512-NEXT:    vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm2
+; AVX512-NEXT:    seta %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
+; AVX512-NEXT:    vmovd %eax, %xmm2
+; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm9
+; AVX512-NEXT:    vmulss %xmm0, %xmm9, %xmm0
+; AVX512-NEXT:    vxorps %xmm10, %xmm10, %xmm10
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3]
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[3,3,3,3]
+; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT:    vucomiss %xmm2, %xmm2
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm6[3,3,3,3]
+; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT:    vucomiss %xmm3, %xmm3
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k2
+; AVX512-NEXT:    vmovss %xmm2, %xmm3, %xmm3 {%k2}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm1
+; AVX512-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm3
+; AVX512-NEXT:    vmovss %xmm3, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
+; AVX512-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm2
+; AVX512-NEXT:    vucomiss %xmm2, %xmm3
+; AVX512-NEXT:    seta %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovss %xmm3, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT:    vmulss %xmm2, %xmm9, %xmm2
+; AVX512-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3]
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
+; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovd %xmm1, %eax
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm0
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT:    vucomiss %xmm3, %xmm3
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k2
+; AVX512-NEXT:    vmovss %xmm0, %xmm3, %xmm3 {%k2}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm1
+; AVX512-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm5
+; AVX512-NEXT:    vmovss %xmm5, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm3
+; AVX512-NEXT:    vucomiss %xmm3, %xmm5
+; AVX512-NEXT:    seta %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovss %xmm5, %xmm3, %xmm3 {%k1}
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm4[1,0]
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm0
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm5 = xmm6[1,0]
+; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT:    vucomiss %xmm5, %xmm5
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k2
+; AVX512-NEXT:    vmovss %xmm0, %xmm5, %xmm5 {%k2}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm5, %xmm15
+; AVX512-NEXT:    vcvtph2ps %xmm15, %xmm5
+; AVX512-NEXT:    vmovss %xmm5, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm5
+; AVX512-NEXT:    seta %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovss %xmm5, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT:    vmulss %xmm3, %xmm9, %xmm3
+; AVX512-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0],xmm10[1,2,3]
+; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm1
+; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovd %xmm1, %eax
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vmulss %xmm0, %xmm9, %xmm0
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3]
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovd %xmm0, %ecx
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7]
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm0
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm6[3,3,3,3,4,5,6,7]
+; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT:    vucomiss %xmm2, %xmm2
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k2
+; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k2}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm11
+; AVX512-NEXT:    vcvtph2ps %xmm11, %xmm3
+; AVX512-NEXT:    vmovss %xmm3, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm2
+; AVX512-NEXT:    vucomiss %xmm2, %xmm3
+; AVX512-NEXT:    seta %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovss %xmm3, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm0
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT:    vucomiss %xmm3, %xmm3
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k2
+; AVX512-NEXT:    vmovss %xmm0, %xmm3, %xmm3 {%k2}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm7
+; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm3
+; AVX512-NEXT:    vmovss %xmm3, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm12
+; AVX512-NEXT:    vcvtph2ps %xmm12, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm3
+; AVX512-NEXT:    seta %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovss %xmm3, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT:    vmulss %xmm2, %xmm9, %xmm2
+; AVX512-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3]
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm14
+; AVX512-NEXT:    vmovd %xmm14, %eax
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vmulss %xmm0, %xmm9, %xmm0
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3]
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm13
+; AVX512-NEXT:    vmovd %xmm13, %ecx
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm0
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm2
+; AVX512-NEXT:    vucomiss %xmm2, %xmm2
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k2
+; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k2}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm3
+; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm1
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm8
+; AVX512-NEXT:    vcvtph2ps %xmm8, %xmm2
+; AVX512-NEXT:    vucomiss %xmm2, %xmm1
+; AVX512-NEXT:    seta %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[1,1,1,1,4,5,6,7]
+; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT:    vucomiss %xmm1, %xmm1
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm6[1,1,1,1,4,5,6,7]
+; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT:    vucomiss %xmm4, %xmm4
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k2
+; AVX512-NEXT:    vmovss %xmm1, %xmm4, %xmm4 {%k2}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm6
+; AVX512-NEXT:    vmovss %xmm6, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm6
+; AVX512-NEXT:    seta %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovss %xmm6, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT:    vmulss %xmm2, %xmm9, %xmm2
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vmulss %xmm0, %xmm9, %xmm0
+; AVX512-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3]
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3]
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm9
+; AVX512-NEXT:    vmovd %xmm9, %eax
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm10
+; AVX512-NEXT:    vmovd %xmm10, %ecx
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; AVX512-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm6 = xmm0[0],mem[0]
+; AVX512-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT:    vmovd %xmm0, %ecx
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX512-NEXT:    vmovd %xmm2, %eax
+; AVX512-NEXT:    vmovd %xmm15, %ecx
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
+; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX512-NEXT:    vmovd %xmm11, %eax
+; AVX512-NEXT:    vmovd %xmm7, %ecx
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
+; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX512-NEXT:    vmovd %xmm3, %eax
+; AVX512-NEXT:    vmovd %xmm4, %ecx
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm3
+; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpcmpeqw %xmm0, %xmm2, %xmm3
+; AVX512-NEXT:    vpblendvb %xmm3, %xmm2, %xmm6, %xmm2
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512-NEXT:    vmovd %xmm3, %eax
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512-NEXT:    vmovd %xmm3, %ecx
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm3
+; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512-NEXT:    vmovd %xmm4, %eax
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512-NEXT:    vmovd %xmm4, %ecx
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
+; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512-NEXT:    vmovd %xmm4, %eax
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
+; AVX512-NEXT:    vmovd %xmm12, %eax
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-NEXT:    vmovd %xmm8, %eax
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
+; AVX512-NEXT:    vmovd %xmm1, %eax
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX512-NEXT:    vpcmpeqw %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
+; AVX512-NEXT:    vcvtph2ps %xmm10, %xmm1
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vucomiss %xmm2, %xmm1
+; AVX512-NEXT:    movl $65535, %ecx # imm = 0xFFFF
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovel %ecx, %edx
+; AVX512-NEXT:    vcvtph2ps %xmm9, %xmm1
+; AVX512-NEXT:    vucomiss %xmm2, %xmm1
+; AVX512-NEXT:    movl $0, %esi
+; AVX512-NEXT:    cmovel %ecx, %esi
+; AVX512-NEXT:    vcvtph2ps %xmm13, %xmm1
+; AVX512-NEXT:    vucomiss %xmm2, %xmm1
+; AVX512-NEXT:    movl $0, %edi
+; AVX512-NEXT:    cmovel %ecx, %edi
+; AVX512-NEXT:    vcvtph2ps %xmm14, %xmm1
+; AVX512-NEXT:    vucomiss %xmm2, %xmm1
+; AVX512-NEXT:    movl $0, %r8d
+; AVX512-NEXT:    cmovel %ecx, %r8d
+; AVX512-NEXT:    vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT:    vucomiss %xmm2, %xmm1
+; AVX512-NEXT:    movl $0, %r9d
+; AVX512-NEXT:    cmovel %ecx, %r9d
+; AVX512-NEXT:    vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT:    vucomiss %xmm2, %xmm1
+; AVX512-NEXT:    movl $0, %r10d
+; AVX512-NEXT:    cmovel %ecx, %r10d
+; AVX512-NEXT:    vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT:    vucomiss %xmm2, %xmm1
+; AVX512-NEXT:    movl $0, %r11d
+; AVX512-NEXT:    cmovel %ecx, %r11d
+; AVX512-NEXT:    vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT:    vucomiss %xmm2, %xmm1
+; AVX512-NEXT:    vmovd %esi, %xmm1
+; AVX512-NEXT:    vpinsrw $1, %edx, %xmm1, %xmm1
+; AVX512-NEXT:    vpinsrw $2, %edi, %xmm1, %xmm1
+; AVX512-NEXT:    vpinsrw $3, %r8d, %xmm1, %xmm1
+; AVX512-NEXT:    vpinsrw $4, %r9d, %xmm1, %xmm1
+; AVX512-NEXT:    vpinsrw $5, %r10d, %xmm1, %xmm1
+; AVX512-NEXT:    vpinsrw $6, %r11d, %xmm1, %xmm1
+; AVX512-NEXT:    cmovel %ecx, %eax
+; AVX512-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX512-NEXT:    vpblendvb %xmm1, %xmm0, %xmm6, %xmm0
+; AVX512-NEXT:    addq $88, %rsp
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_v4f16:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxph $17, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_v4f16:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $164, %esp
+; X86-NEXT:    vmovdqa %xmm0, %xmm2
+; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vpsrlq $48, %xmm1, %xmm0
+; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vpsrld $16, %xmm2, %xmm0
+; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vpsrld $16, %xmm1, %xmm0
+; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vpextrw $0, %xmm1, (%esp)
+; X86-NEXT:    calll __extendhfsf2
+; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
+; X86-NEXT:    calll __extendhfsf2
+; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
+; X86-NEXT:    calll __extendhfsf2
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovd %xmm2, %eax
+; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    js .LBB33_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    vmovdqa %xmm2, %xmm1
+; X86-NEXT:    jmp .LBB33_3
+; X86-NEXT:  .LBB33_1:
+; X86-NEXT:    vmovdqa %xmm0, %xmm1
+; X86-NEXT:    vmovdqa %xmm2, %xmm0
+; X86-NEXT:  .LBB33_3:
+; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    calll __extendhfsf2
+; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovd %xmm1, %eax
+; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    js .LBB33_4
+; X86-NEXT:  # %bb.5:
+; X86-NEXT:    vmovdqa %xmm1, %xmm2
+; X86-NEXT:    jmp .LBB33_6
+; X86-NEXT:  .LBB33_4:
+; X86-NEXT:    vmovdqa %xmm0, %xmm2
+; X86-NEXT:    vmovdqa %xmm1, %xmm0
+; X86-NEXT:  .LBB33_6:
+; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    calll __truncsfhf2
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    calll __truncsfhf2
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
+; X86-NEXT:    calll __extendhfsf2
+; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
+; X86-NEXT:    calll __extendhfsf2
+; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
+; X86-NEXT:    calll __extendhfsf2
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovd %xmm1, %eax
+; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    js .LBB33_7
+; X86-NEXT:  # %bb.8:
+; X86-NEXT:    vmovdqa %xmm1, %xmm2
+; X86-NEXT:    jmp .LBB33_9
+; X86-NEXT:  .LBB33_7:
+; X86-NEXT:    vmovdqa %xmm0, %xmm2
+; X86-NEXT:    vmovdqa %xmm1, %xmm0
+; X86-NEXT:  .LBB33_9:
+; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    calll __extendhfsf2
+; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovd %xmm1, %eax
+; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    js .LBB33_10
+; X86-NEXT:  # %bb.11:
+; X86-NEXT:    vmovdqa %xmm1, %xmm2
+; X86-NEXT:    jmp .LBB33_12
+; X86-NEXT:  .LBB33_10:
+; X86-NEXT:    vmovdqa %xmm0, %xmm2
+; X86-NEXT:    vmovdqa %xmm1, %xmm0
+; X86-NEXT:  .LBB33_12:
+; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    calll __truncsfhf2
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vmovd %xmm0, (%esp)
+; X86-NEXT:    calll __truncsfhf2
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; X86-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
+; X86-NEXT:    addl $164, %esp
+; X86-NEXT:    retl
+  %r = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> %x, <4 x half> %y)
+  ret <4 x half> %r
+}
+
+define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum_v4bf16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %r15
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    subq $56, %rsp
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    pextrw $0, %xmm1, %r14d
+; SSE2-NEXT:    pextrw $0, %xmm0, %r15d
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; SSE2-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pextrw $0, %xmm0, %ecx
+; SSE2-NEXT:    shll $16, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm3
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    testl %ecx, %ecx
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    js .LBB34_2
+; SSE2-NEXT:  # %bb.1:
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:  .LBB34_2:
+; SSE2-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1,1,1]
+; SSE2-NEXT:    movdqa %xmm5, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1,1,1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    cmpordss %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm6
+; SSE2-NEXT:    andps %xmm1, %xmm6
+; SSE2-NEXT:    js .LBB34_4
+; SSE2-NEXT:  # %bb.3:
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:  .LBB34_4:
+; SSE2-NEXT:    pextrw $0, %xmm4, %ebp
+; SSE2-NEXT:    pextrw $0, %xmm5, %ebx
+; SSE2-NEXT:    maxss %xmm2, %xmm1
+; SSE2-NEXT:    andnps %xmm1, %xmm0
+; SSE2-NEXT:    orps %xmm6, %xmm0
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    shll $16, %r15d
+; SSE2-NEXT:    movd %r15d, %xmm3
+; SSE2-NEXT:    shll $16, %r14d
+; SSE2-NEXT:    movd %r14d, %xmm2
+; SSE2-NEXT:    testl %r15d, %r15d
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    js .LBB34_6
+; SSE2-NEXT:  # %bb.5:
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:  .LBB34_6:
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE2-NEXT:    psrlq $48, %xmm5
+; SSE2-NEXT:    movdqa (%rsp), %xmm6 # 16-byte Reload
+; SSE2-NEXT:    psrlq $48, %xmm6
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    cmpordss %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm4
+; SSE2-NEXT:    andps %xmm1, %xmm4
+; SSE2-NEXT:    js .LBB34_8
+; SSE2-NEXT:  # %bb.7:
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:  .LBB34_8:
+; SSE2-NEXT:    pextrw $0, %xmm5, %r15d
+; SSE2-NEXT:    pextrw $0, %xmm6, %r14d
+; SSE2-NEXT:    maxss %xmm2, %xmm1
+; SSE2-NEXT:    andnps %xmm1, %xmm0
+; SSE2-NEXT:    orps %xmm4, %xmm0
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movd %ebx, %xmm1
+; SSE2-NEXT:    shll $16, %ebp
+; SSE2-NEXT:    movd %ebp, %xmm3
+; SSE2-NEXT:    testl %ebx, %ebx
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    js .LBB34_10
+; SSE2-NEXT:  # %bb.9:
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:  .LBB34_10:
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    cmpordss %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm4
+; SSE2-NEXT:    andps %xmm2, %xmm4
+; SSE2-NEXT:    js .LBB34_12
+; SSE2-NEXT:  # %bb.11:
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:  .LBB34_12:
+; SSE2-NEXT:    maxss %xmm3, %xmm2
+; SSE2-NEXT:    andnps %xmm2, %xmm0
+; SSE2-NEXT:    orps %xmm4, %xmm0
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    shll $16, %r14d
+; SSE2-NEXT:    movd %r14d, %xmm1
+; SSE2-NEXT:    shll $16, %r15d
+; SSE2-NEXT:    movd %r15d, %xmm3
+; SSE2-NEXT:    testl %r14d, %r14d
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    js .LBB34_14
+; SSE2-NEXT:  # %bb.13:
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:  .LBB34_14:
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    cmpordss %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm4
+; SSE2-NEXT:    andps %xmm2, %xmm4
+; SSE2-NEXT:    js .LBB34_16
+; SSE2-NEXT:  # %bb.15:
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:  .LBB34_16:
+; SSE2-NEXT:    maxss %xmm3, %xmm2
+; SSE2-NEXT:    andnps %xmm2, %xmm0
+; SSE2-NEXT:    orps %xmm4, %xmm0
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    addq $56, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    popq %r15
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fmaximumnum_v4bf16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    pushq %r15
+; AVX1-NEXT:    pushq %r14
+; AVX1-NEXT:    pushq %r13
+; AVX1-NEXT:    pushq %r12
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    subq $56, %rsp
+; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrlq $48, %xmm1, %xmm3
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpextrw $0, %xmm4, %ebx
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vpextrw $0, %xmm4, %ebp
+; AVX1-NEXT:    vpextrw $0, %xmm0, %r12d
+; AVX1-NEXT:    vpextrw $0, %xmm1, %r13d
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm0
+; AVX1-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX1-NEXT:    shll $16, %ecx
+; AVX1-NEXT:    vmovd %ecx, %xmm0
+; AVX1-NEXT:    shll $16, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm4
+; AVX1-NEXT:    js .LBB34_1
+; AVX1-NEXT:  # %bb.2:
+; AVX1-NEXT:    vmovdqa %xmm4, %xmm1
+; AVX1-NEXT:    jmp .LBB34_3
+; AVX1-NEXT:  .LBB34_1:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX1-NEXT:    vmovdqa %xmm4, %xmm0
+; AVX1-NEXT:  .LBB34_3:
+; AVX1-NEXT:    vpextrw $0, %xmm2, %r14d
+; AVX1-NEXT:    vpextrw $0, %xmm3, %r15d
+; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    callq __truncsfbf2@PLT
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    shll $16, %r13d
+; AVX1-NEXT:    vmovd %r13d, %xmm0
+; AVX1-NEXT:    shll $16, %r12d
+; AVX1-NEXT:    vmovd %r12d, %xmm2
+; AVX1-NEXT:    js .LBB34_4
+; AVX1-NEXT:  # %bb.5:
+; AVX1-NEXT:    vmovdqa %xmm2, %xmm1
+; AVX1-NEXT:    jmp .LBB34_6
+; AVX1-NEXT:  .LBB34_4:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX1-NEXT:  .LBB34_6:
+; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    callq __truncsfbf2@PLT
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    shll $16, %ebp
+; AVX1-NEXT:    vmovd %ebp, %xmm0
+; AVX1-NEXT:    shll $16, %ebx
+; AVX1-NEXT:    vmovd %ebx, %xmm2
+; AVX1-NEXT:    js .LBB34_7
+; AVX1-NEXT:  # %bb.8:
+; AVX1-NEXT:    vmovdqa %xmm2, %xmm1
+; AVX1-NEXT:    jmp .LBB34_9
+; AVX1-NEXT:  .LBB34_7:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX1-NEXT:  .LBB34_9:
+; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    callq __truncsfbf2@PLT
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    shll $16, %r15d
+; AVX1-NEXT:    vmovd %r15d, %xmm0
+; AVX1-NEXT:    shll $16, %r14d
+; AVX1-NEXT:    vmovd %r14d, %xmm2
+; AVX1-NEXT:    js .LBB34_10
+; AVX1-NEXT:  # %bb.11:
+; AVX1-NEXT:    vmovdqa %xmm2, %xmm1
+; AVX1-NEXT:    jmp .LBB34_12
+; AVX1-NEXT:  .LBB34_10:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX1-NEXT:  .LBB34_12:
+; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    callq __truncsfbf2@PLT
+; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
+; AVX1-NEXT:    addq $56, %rsp
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    popq %r12
+; AVX1-NEXT:    popq %r13
+; AVX1-NEXT:    popq %r14
+; AVX1-NEXT:    popq %r15
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fmaximumnum_v4bf16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    pushq %r15
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    pushq %r13
+; AVX512-NEXT:    pushq %r12
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    pushq %rax
+; AVX512-NEXT:    vmovq %xmm1, %r13
+; AVX512-NEXT:    movq %r13, %rbx
+; AVX512-NEXT:    shrq $32, %rbx
+; AVX512-NEXT:    vmovq %xmm0, %rbp
+; AVX512-NEXT:    movq %rbp, %r14
+; AVX512-NEXT:    shrq $32, %r14
+; AVX512-NEXT:    movq %r13, %r15
+; AVX512-NEXT:    shrq $48, %r15
+; AVX512-NEXT:    movq %rbp, %r12
+; AVX512-NEXT:    shrq $48, %r12
+; AVX512-NEXT:    movl %ebp, %eax
+; AVX512-NEXT:    andl $-65536, %eax # imm = 0xFFFF0000
+; AVX512-NEXT:    sets %cl
+; AVX512-NEXT:    kmovw %ecx, %k1
+; AVX512-NEXT:    movl %r13d, %ecx
+; AVX512-NEXT:    andl $-65536, %ecx # imm = 0xFFFF0000
+; AVX512-NEXT:    vmovd %ecx, %xmm1
+; AVX512-NEXT:    vmovd %eax, %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    callq __truncsfbf2@PLT
+; AVX512-NEXT:    vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT:    shll $16, %ebp
+; AVX512-NEXT:    sets %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    shll $16, %r13d
+; AVX512-NEXT:    vmovd %r13d, %xmm1
+; AVX512-NEXT:    vmovd %ebp, %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    callq __truncsfbf2@PLT
+; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsp)
+; AVX512-NEXT:    shll $16, %r12d
+; AVX512-NEXT:    sets %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    shll $16, %r15d
+; AVX512-NEXT:    vmovd %r15d, %xmm1
+; AVX512-NEXT:    vmovd %r12d, %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    callq __truncsfbf2@PLT
+; AVX512-NEXT:    vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT:    shll $16, %r14d
+; AVX512-NEXT:    sets %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    shll $16, %ebx
+; AVX512-NEXT:    vmovd %ebx, %xmm1
+; AVX512-NEXT:    vmovd %r14d, %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    callq __truncsfbf2@PLT
+; AVX512-NEXT:    vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT:    vmovaps (%rsp), %xmm0
+; AVX512-NEXT:    addq $8, %rsp
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    popq %r12
+; AVX512-NEXT:    popq %r13
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    popq %r15
+; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_v4bf16:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxnepbf16 $17, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_v4bf16:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $68, %esp
+; X86-NEXT:    vpsrlq $48, %xmm0, %xmm2
+; X86-NEXT:    vpsrlq $48, %xmm1, %xmm3
+; X86-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; X86-NEXT:    vpextrw $0, %xmm4, %esi
+; X86-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; X86-NEXT:    vpextrw $0, %xmm4, %ebx
+; X86-NEXT:    vpextrw $0, %xmm0, %eax
+; X86-NEXT:    vpextrw $0, %xmm1, %ecx
+; X86-NEXT:    vpsrld $16, %xmm0, %xmm0
+; X86-NEXT:    vpextrw $0, %xmm0, %edx
+; X86-NEXT:    vpsrld $16, %xmm1, %xmm0
+; X86-NEXT:    vpextrw $0, %xmm0, %edi
+; X86-NEXT:    shll $16, %edi
+; X86-NEXT:    vmovd %edi, %xmm0
+; X86-NEXT:    shll $16, %edx
+; X86-NEXT:    vmovd %edx, %xmm4
+; X86-NEXT:    js .LBB34_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    vmovdqa %xmm4, %xmm1
+; X86-NEXT:    jmp .LBB34_3
+; X86-NEXT:  .LBB34_1:
+; X86-NEXT:    vmovdqa %xmm0, %xmm1
+; X86-NEXT:    vmovdqa %xmm4, %xmm0
+; X86-NEXT:  .LBB34_3:
+; X86-NEXT:    vpextrw $0, %xmm2, %edi
+; X86-NEXT:    vpextrw $0, %xmm3, %ebp
+; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    shll $16, %ecx
+; X86-NEXT:    vmovd %ecx, %xmm0
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    vmovd %eax, %xmm2
+; X86-NEXT:    js .LBB34_4
+; X86-NEXT:  # %bb.5:
+; X86-NEXT:    vmovdqa %xmm2, %xmm1
+; X86-NEXT:    jmp .LBB34_6
+; X86-NEXT:  .LBB34_4:
+; X86-NEXT:    vmovdqa %xmm0, %xmm1
+; X86-NEXT:    vmovdqa %xmm2, %xmm0
+; X86-NEXT:  .LBB34_6:
+; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    calll __truncsfbf2
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    shll $16, %ebx
+; X86-NEXT:    vmovd %ebx, %xmm0
+; X86-NEXT:    shll $16, %esi
+; X86-NEXT:    vmovd %esi, %xmm2
+; X86-NEXT:    js .LBB34_7
+; X86-NEXT:  # %bb.8:
+; X86-NEXT:    vmovdqa %xmm2, %xmm1
+; X86-NEXT:    jmp .LBB34_9
+; X86-NEXT:  .LBB34_7:
+; X86-NEXT:    vmovdqa %xmm0, %xmm1
+; X86-NEXT:    vmovdqa %xmm2, %xmm0
+; X86-NEXT:  .LBB34_9:
+; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    calll __truncsfbf2
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    shll $16, %ebp
+; X86-NEXT:    vmovd %ebp, %xmm0
+; X86-NEXT:    shll $16, %edi
+; X86-NEXT:    vmovd %edi, %xmm2
+; X86-NEXT:    js .LBB34_10
+; X86-NEXT:  # %bb.11:
+; X86-NEXT:    vmovdqa %xmm2, %xmm1
+; X86-NEXT:    jmp .LBB34_12
+; X86-NEXT:  .LBB34_10:
+; X86-NEXT:    vmovdqa %xmm0, %xmm1
+; X86-NEXT:    vmovdqa %xmm2, %xmm0
+; X86-NEXT:  .LBB34_12:
+; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    calll __truncsfbf2
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vmovd %xmm0, (%esp)
+; X86-NEXT:    calll __truncsfbf2
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; X86-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
+; X86-NEXT:    addl $68, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %r = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
+  ret <4 x bfloat> %r
+}
diff --git a/llvm/test/CodeGen/X86/fold-vector-shuffle-crash.ll b/llvm/test/CodeGen/X86/fold-vector-shuffle-crash.ll
index 9543238..55d9ea9 100644
--- a/llvm/test/CodeGen/X86/fold-vector-shuffle-crash.ll
+++ b/llvm/test/CodeGen/X86/fold-vector-shuffle-crash.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=corei7
 
-define void @autogen_SD13708(i32) {
+define void @autogen_SD13708(i32, i1 %arg) {
 BB:
  %Shuff7 = shufflevector <8 x i32> zeroinitializer, <8 x i32> zeroinitializer, <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 undef, i32 2, i32 4, i32 undef>
  br label %CF
@@ -8,11 +8,11 @@ BB:
 CF:
  %Tr = trunc <8 x i64> zeroinitializer to <8 x i32>
  %Shuff20 = shufflevector <8 x i32> %Shuff7, <8 x i32> %Tr, <8 x i32> <i32 13, i32 15, i32 1, i32 3, i32 5, i32 7, i32 undef, i32 11>
- br i1 undef, label %CF, label %CF247
+ br i1 %arg, label %CF, label %CF247
 
 CF247:
  %I171 = insertelement <8 x i32> %Shuff20, i32 %0, i32 0
- br i1 undef, label %CF, label %CF247
+ br i1 %arg, label %CF, label %CF247
 }
 
 define void @autogen_SD13800(ptr, ptr, ptr, i32, i64, i8) {
diff --git a/llvm/test/CodeGen/X86/hoist-spill.ll b/llvm/test/CodeGen/X86/hoist-spill.ll
index d11b666..b51609c 100644
--- a/llvm/test/CodeGen/X86/hoist-spill.ll
+++ b/llvm/test/CodeGen/X86/hoist-spill.ll
@@ -14,7 +14,7 @@ target triple = "x86_64-unknown-linux-gnu"
 @d = external global ptr, align 8
 
 ; Function Attrs: norecurse noreturn nounwind uwtable
-define void @fn1(i32 %p1, i32 %p2, i64 %p3) {
+define void @fn1(i32 %p1, i32 %p2, i64 %p3, i1 %arg) {
 entry:
   %tmp = load ptr, ptr @d, align 8
   %tmp1 = load ptr, ptr @a, align 8
@@ -54,10 +54,10 @@ for.cond4.preheader:                              ; preds = %for.body, %for.cond
   br i1 %cmp528, label %for.inc14, label %for.body6.preheader
 
 for.body6.preheader:                              ; preds = %for.cond4.preheader
-  br i1 undef, label %for.body6, label %min.iters.checked
+  br i1 %arg, label %for.body6, label %min.iters.checked
 
 min.iters.checked:                                ; preds = %for.body6.preheader
-  br i1 undef, label %for.body6, label %vector.memcheck
+  br i1 %arg, label %for.body6, label %vector.memcheck
 
 vector.memcheck:                                  ; preds = %min.iters.checked
   %bound1 = icmp ule ptr undef, %scevgep41
@@ -85,10 +85,10 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %tmp16 = getelementptr inbounds i32, ptr %tmp1, i64 %offset.idx.1
   store <4 x i32> %wide.load.1, ptr %tmp16, align 4
   %index.next.3 = add i64 %index, 32
-  br i1 undef, label %middle.block, label %vector.body
+  br i1 %arg, label %middle.block, label %vector.body
 
 middle.block:                                     ; preds = %vector.body, %vector.body.preheader.split
-  br i1 undef, label %for.inc14, label %for.body6
+  br i1 %arg, label %for.inc14, label %for.body6
 
 for.body.preheader:                               ; preds = %for.cond
   br label %for.body
@@ -98,7 +98,7 @@ for.body:                                         ; preds = %for.body, %for.body
   %add = add nsw i32 %k.127, 1
   %tmp18 = load i32, ptr undef, align 4
   store i32 %tmp18, ptr @b, align 4
-  br i1 undef, label %for.body, label %for.cond4.preheader
+  br i1 %arg, label %for.body, label %for.cond4.preheader
 
 for.body6:                                        ; preds = %for.body6, %middle.block, %vector.memcheck, %min.iters.checked, %for.body6.preheader
   %indvars.iv32 = phi i64 [ undef, %for.body6 ], [ %tmp12, %vector.memcheck ], [ %tmp12, %min.iters.checked ], [ %tmp12, %for.body6.preheader ], [ undef, %middle.block ]
diff --git a/llvm/test/CodeGen/X86/implicit-null-checks.mir b/llvm/test/CodeGen/X86/implicit-null-checks.mir
index 0077906..c98019c 100644
--- a/llvm/test/CodeGen/X86/implicit-null-checks.mir
+++ b/llvm/test/CodeGen/X86/implicit-null-checks.mir
@@ -5,15 +5,15 @@
   target triple = "x86_64-apple-macosx"
 
   ;; Positive test
-  define i32 @imp_null_check_with_bitwise_op_0(ptr %x, i32 %val) {
+  define i32 @imp_null_check_with_bitwise_op_0(ptr %x, i32 %val, i1 %arg) {
   entry:
-    br i1 undef, label %is_null, label %not_null, !make.implicit !0
+    br i1 %arg, label %is_null, label %not_null, !make.implicit !0
 
   is_null:
     ret i32 42
 
   not_null:
-    br i1 undef, label %ret_100, label %ret_200
+    br i1 %arg, label %ret_100, label %ret_200
 
   ret_100:
     ret i32 100
@@ -24,15 +24,15 @@
 
   ;; Negative test.  The regalloc is such that we cannot hoist the
   ;; instruction materializing 2200000 into $eax
-  define i32 @imp_null_check_with_bitwise_op_1(ptr %x, i32 %val, ptr %ptr) {
+  define i32 @imp_null_check_with_bitwise_op_1(ptr %x, i32 %val, ptr %ptr, i1 %arg) {
   entry:
-    br i1 undef, label %is_null, label %not_null, !make.implicit !0
+    br i1 %arg, label %is_null, label %not_null, !make.implicit !0
 
   is_null:
     ret i32 undef
 
   not_null:
-    br i1 undef, label %ret_100, label %ret_200
+    br i1 %arg, label %ret_100, label %ret_200
 
   ret_100:
     ret i32 100
@@ -43,15 +43,15 @@
 
   ;; Negative test: IR is identical to
   ;; @imp_null_check_with_bitwise_op_0 but MIR differs.
-  define i32 @imp_null_check_with_bitwise_op_2(ptr %x, i32 %val) {
+  define i32 @imp_null_check_with_bitwise_op_2(ptr %x, i32 %val, i1 %arg) {
   entry:
-    br i1 undef, label %is_null, label %not_null, !make.implicit !0
+    br i1 %arg, label %is_null, label %not_null, !make.implicit !0
 
   is_null:
     ret i32 42
 
   not_null:
-    br i1 undef, label %ret_100, label %ret_200
+    br i1 %arg, label %ret_100, label %ret_200
 
   ret_100:
     ret i32 100
@@ -62,15 +62,15 @@
 
   ;; Negative test: IR is identical to
   ;; @imp_null_check_with_bitwise_op_0 but MIR differs.
-  define i32 @imp_null_check_with_bitwise_op_3(ptr %x, i32 %val) {
+  define i32 @imp_null_check_with_bitwise_op_3(ptr %x, i32 %val, i1 %arg) {
   entry:
-    br i1 undef, label %is_null, label %not_null, !make.implicit !0
+    br i1 %arg, label %is_null, label %not_null, !make.implicit !0
 
   is_null:
     ret i32 42
 
   not_null:
-    br i1 undef, label %ret_100, label %ret_200
+    br i1 %arg, label %ret_100, label %ret_200
 
   ret_100:
     ret i32 100
@@ -80,15 +80,15 @@
   }
 
   ;; Positive test
-  define i32 @imp_null_check_with_bitwise_op_4(ptr %x, i32 %val) {
+  define i32 @imp_null_check_with_bitwise_op_4(ptr %x, i32 %val, i1 %arg) {
   entry:
-    br i1 undef, label %is_null, label %not_null, !make.implicit !0
+    br i1 %arg, label %is_null, label %not_null, !make.implicit !0
 
   is_null:
     ret i32 42
 
   not_null:
-    br i1 undef, label %ret_100, label %ret_200
+    br i1 %arg, label %ret_100, label %ret_200
 
   ret_100:
     ret i32 100
diff --git a/llvm/test/CodeGen/X86/interval-update-remat.ll b/llvm/test/CodeGen/X86/interval-update-remat.ll
index 44d3db3..91fde2b 100644
--- a/llvm/test/CodeGen/X86/interval-update-remat.ll
+++ b/llvm/test/CodeGen/X86/interval-update-remat.ll
@@ -17,13 +17,13 @@ target triple = "i386-unknown-linux-gnu"
 @f = external global i16, align 2
 @.str = external unnamed_addr constant [12 x i8], align 1
 
-define void @fn1() {
+define void @fn1(i1 %arg) {
 entry:
   %tmp = load i64, ptr @b, align 8
   %or = or i64 0, 3299921317
   %and = and i64 %or, %tmp
   %tmp1 = load i32, ptr @d, align 4
-  br i1 undef, label %lor.rhs, label %lor.end
+  br i1 %arg, label %lor.rhs, label %lor.end
 
 lor.rhs:                                          ; preds = %entry
   %tobool3 = icmp ne i8 undef, 0
@@ -32,7 +32,7 @@ lor.rhs:                                          ; preds = %entry
 lor.end:                                          ; preds = %lor.rhs, %entry
   %lor.ext = zext i1 undef to i32
   %tmp2 = load i64, ptr @e, align 8
-  br i1 undef, label %lor.rhs5, label %lor.end7
+  br i1 %arg, label %lor.rhs5, label %lor.end7
 
 lor.rhs5:                                         ; preds = %lor.end
   br label %lor.end7
diff --git a/llvm/test/CodeGen/X86/jump_sign.ll b/llvm/test/CodeGen/X86/jump_sign.ll
index 9eaa654..6dc0427 100644
--- a/llvm/test/CodeGen/X86/jump_sign.ll
+++ b/llvm/test/CodeGen/X86/jump_sign.ll
@@ -249,16 +249,16 @@ define void @func_o() nounwind uwtable {
 ; CHECK-NEXT:  .LBB12_7: # %if.else.i97
 entry:
   %0 = load i16, ptr undef, align 2
-  br i1 undef, label %if.then.i, label %if.end.i
+  br i1 poison, label %if.then.i, label %if.end.i
 
 if.then.i:                                        ; preds = %entry
   unreachable
 
 if.end.i:                                         ; preds = %entry
-  br i1 undef, label %sw.bb, label %sw.default
+  br i1 poison, label %sw.bb, label %sw.default
 
 sw.bb:                                            ; preds = %if.end.i
-  br i1 undef, label %if.then44, label %if.end29
+  br i1 poison, label %if.then44, label %if.end29
 
 if.end29:                                         ; preds = %sw.bb
   %1 = urem i16 %0, 10
@@ -267,7 +267,7 @@ if.end29:                                         ; preds = %sw.bb
   br i1 %cmp25, label %if.then44, label %sw.default
 
 sw.default:                                       ; preds = %if.end29, %if.end.i
-  br i1 undef, label %if.then.i96, label %if.else.i97
+  br i1 poison, label %if.then.i96, label %if.else.i97
 
 if.then.i96:                                      ; preds = %sw.default
   unreachable
@@ -277,7 +277,7 @@ if.else.i97:                                      ; preds = %sw.default
 
 if.then44:                                        ; preds = %if.end29, %sw.bb
   %aModeRefSel.1.ph = phi i16 [ %., %if.end29 ], [ 3, %sw.bb ]
-  br i1 undef, label %if.then.i103, label %if.else.i104
+  br i1 poison, label %if.then.i103, label %if.else.i104
 
 if.then.i103:                                     ; preds = %if.then44
   unreachable
@@ -420,4 +420,3 @@ if.end:
 }
 
 !1 = !{!"branch_weights", i32 2, i32 1}
-
diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce-crash.ll b/llvm/test/CodeGen/X86/loop-strength-reduce-crash.ll
index a004333..9cd7551 100644
--- a/llvm/test/CodeGen/X86/loop-strength-reduce-crash.ll
+++ b/llvm/test/CodeGen/X86/loop-strength-reduce-crash.ll
@@ -7,7 +7,7 @@
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.12.0"
 
-define void @foo() {
+define void @foo(i1 %arg) {
 entry:
   br label %for
 
@@ -17,7 +17,7 @@ for:
   store i32 %next, ptr undef, align 4
   %add = add i64 %0, 9223372036854775807
   %inc = add nsw i32 %next, 1
-  br i1 undef, label %exit, label %for
+  br i1 %arg, label %exit, label %for
 
 exit:
   store i64 %add, ptr undef
diff --git a/llvm/test/CodeGen/X86/lsr-crash-empty-uses.ll b/llvm/test/CodeGen/X86/lsr-crash-empty-uses.ll
index 552999f..cf43441 100644
--- a/llvm/test/CodeGen/X86/lsr-crash-empty-uses.ll
+++ b/llvm/test/CodeGen/X86/lsr-crash-empty-uses.ll
@@ -3,7 +3,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 ; CHECK-LABEL: @hoge
-define void @hoge() {
+define void @hoge(i1 %arg) {
 bb:
   %tmp = sext i32 undef to i64
   %tmp3 = sub nsw i64 0, %tmp
@@ -21,7 +21,7 @@ bb7:                                              ; preds = %bb7, %bb4
   br i1 true, label %bb11, label %bb7
 
 bb11:                                             ; preds = %bb7
-  br i1 undef, label %bb20, label %bb12
+  br i1 %arg, label %bb20, label %bb12
 
 bb12:                                             ; preds = %bb11
   br label %bb13
diff --git a/llvm/test/CodeGen/X86/lsr-delayed-fold.ll b/llvm/test/CodeGen/X86/lsr-delayed-fold.ll
index efa9331..a35015d 100644
--- a/llvm/test/CodeGen/X86/lsr-delayed-fold.ll
+++ b/llvm/test/CodeGen/X86/lsr-delayed-fold.ll
@@ -30,7 +30,7 @@ bb24:                                             ; preds = %bb21, %bb11
 ; ScalarEvolution should be able to correctly expand the crazy addrec here.
 ; PR6914
 
-define void @int323() nounwind {
+define void @int323(i1 %arg) nounwind {
 entry:
   br label %for.cond
 
@@ -38,7 +38,7 @@ for.cond:                                         ; preds = %lbl_264, %for.inc,
   %g_263.tmp.1 = phi i8 [ undef, %entry ], [ %g_263.tmp.1, %for.cond ]
   %p_95.addr.0 = phi i8 [ 0, %entry ], [ %add, %for.cond ]
   %add = add i8 %p_95.addr.0, 1                   ; <i8> [#uses=1]
-  br i1 undef, label %for.cond, label %lbl_264
+  br i1 %arg, label %for.cond, label %lbl_264
 
 lbl_264:                                          ; preds = %if.end, %lbl_264.preheader
   %g_263.tmp.0 = phi i8 [ %g_263.tmp.1, %for.cond ] ; <i8> [#uses=1]
@@ -56,13 +56,13 @@ lbl_264:                                          ; preds = %if.end, %lbl_264.pr
 
 %struct.Bu = type { i32, i32, i32 }
 
-define void @_Z3fooP2Bui(ptr nocapture %bu) {
+define void @_Z3fooP2Bui(ptr nocapture %bu, i1 %arg) {
 entry:
   br label %for.body
 
 for.body:                                         ; preds = %for.inc131, %entry
   %indvar = phi i64 [ %indvar.next, %for.inc131 ], [ 0, %entry ] ; <i64> [#uses=3]
-  br i1 undef, label %for.inc131, label %lor.lhs.false
+  br i1 %arg, label %for.inc131, label %lor.lhs.false
 
 lor.lhs.false:                                    ; preds = %for.body
   %tmp15 = add i64 %indvar, 1                     ; <i64> [#uses=1]
@@ -123,11 +123,11 @@ for.body123:                                      ; preds = %for.body123, %lor.l
   %add129 = add i32 %mul, %j.03                   ; <i32> [#uses=1]
   tail call void undef(i32 %add129)
   %inc = add nsw i32 %j.03, 1                     ; <i32> [#uses=1]
-  br i1 undef, label %for.inc131, label %for.body123
+  br i1 %arg, label %for.inc131, label %for.body123
 
 for.inc131:                                       ; preds = %for.body123, %for.body
   %indvar.next = add i64 %indvar, 1               ; <i64> [#uses=1]
-  br i1 undef, label %for.end134, label %for.body
+  br i1 %arg, label %for.end134, label %for.body
 
 for.end134:                                       ; preds = %for.inc131
   ret void
@@ -138,14 +138,14 @@ for.end134:                                       ; preds = %for.inc131
 ; require insert point adjustment.
 ; PR7306
 
-define fastcc i32 @GetOptimum() nounwind {
+define fastcc i32 @GetOptimum(i1 %arg) nounwind {
 bb:
   br label %bb1
 
 bb1:                                              ; preds = %bb1, %bb
   %t = phi i32 [ 0, %bb ], [ %t2, %bb1 ]      ; <i32> [#uses=1]
   %t2 = add i32 %t, undef                     ; <i32> [#uses=3]
-  br i1 undef, label %bb1, label %bb3
+  br i1 %arg, label %bb1, label %bb3
 
 bb3:                                              ; preds = %bb1
   %t4 = add i32 undef, -1                       ; <i32> [#uses=1]
@@ -155,13 +155,13 @@ bb5:                                              ; preds = %bb16, %bb3
   %t6 = phi i32 [ %t17, %bb16 ], [ 0, %bb3 ]  ; <i32> [#uses=3]
   %t7 = add i32 undef, %t6                    ; <i32> [#uses=2]
   %t8 = add i32 %t4, %t6                    ; <i32> [#uses=1]
-  br i1 undef, label %bb9, label %bb10
+  br i1 %arg, label %bb9, label %bb10
 
 bb9:                                              ; preds = %bb5
   br label %bb10
 
 bb10:                                             ; preds = %bb9, %bb5
-  br i1 undef, label %bb11, label %bb16
+  br i1 %arg, label %bb11, label %bb16
 
 bb11:                                             ; preds = %bb10
   %t12 = icmp ugt i32 %t7, %t2              ; <i1> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll b/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll
index 5828f06..41eae3c 100644
--- a/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll
+++ b/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll
@@ -52,7 +52,7 @@ define void @PR24199(i32 %a0) {
 entry:
   %i = alloca %struct.A, align 8
   %tobool = icmp ne i32 %a0, 0
-  br i1 undef, label %if.end, label %if.then
+  br i1 poison, label %if.end, label %if.then
 
 if.then:
   br label %if.end
@@ -96,5 +96,3 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 !4 = !DIExpression()
 !5 = !DILocalVariable(name: "this", arg: 1, scope: !3, flags: DIFlagArtificial | DIFlagObjectPointer)
 !6 = !DILocation(line: 0, scope: !3)
-
-
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
index ee5fd78..62935f7 100644
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
@@ -193,13 +193,13 @@ define i32 @length4(ptr %X, ptr %Y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    seta %al
-; X86-NEXT:    sbbl $0, %eax
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movsbl %al, %eax
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
   ret i32 %m
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
index a46f9ed..9bbd335 100644
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
@@ -179,14 +179,14 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
 define i32 @length4(ptr %X, ptr %Y) nounwind {
 ; X64-LABEL: length4:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    movl (%rsi), %ecx
+; X64-NEXT:    bswapl %eax
 ; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    cmpl %ecx, %eax
 ; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
   ret i32 %m
@@ -391,14 +391,14 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
 define i32 @length8(ptr %X, ptr %Y) nounwind {
 ; X64-LABEL: length8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    movq (%rsi), %rcx
+; X64-NEXT:    bswapq %rax
 ; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    cmpq %rcx, %rax
 ; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
   ret i32 %m
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
index 4a9643c..3a16ab6 100644
--- a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
@@ -122,13 +122,13 @@ define i32 @length4(ptr %X, ptr %Y) nounwind optsize {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    seta %al
-; X86-NEXT:    sbbl $0, %eax
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movsbl %al, %eax
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
   ret i32 %m
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll
index 4e27301..0f817b2 100644
--- a/llvm/test/CodeGen/X86/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll
@@ -107,14 +107,14 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind optsize {
 define i32 @length4(ptr %X, ptr %Y) nounwind optsize {
 ; X64-LABEL: length4:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    movl (%rsi), %ecx
+; X64-NEXT:    bswapl %eax
 ; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    cmpl %ecx, %eax
 ; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
   ret i32 %m
@@ -186,14 +186,14 @@ define i1 @length5_eq(ptr %X, ptr %Y) nounwind optsize {
 define i32 @length8(ptr %X, ptr %Y) nounwind optsize {
 ; X64-LABEL: length8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    movq (%rsi), %rcx
+; X64-NEXT:    bswapq %rax
 ; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    cmpq %rcx, %rax
 ; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
   ret i32 %m
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
index bdb50f5..35fd373 100644
--- a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
@@ -122,13 +122,13 @@ define i32 @length4(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    seta %al
-; X86-NEXT:    sbbl $0, %eax
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movsbl %al, %eax
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
   ret i32 %m
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll
index 9347e54..f638852 100644
--- a/llvm/test/CodeGen/X86/memcmp-pgso.ll
+++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll
@@ -107,14 +107,14 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind !prof !14 {
 define i32 @length4(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X64-LABEL: length4:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    movl (%rsi), %ecx
+; X64-NEXT:    bswapl %eax
 ; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    cmpl %ecx, %eax
 ; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
   ret i32 %m
@@ -186,14 +186,14 @@ define i1 @length5_eq(ptr %X, ptr %Y) nounwind !prof !14 {
 define i32 @length8(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X64-LABEL: length8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    movq (%rsi), %rcx
+; X64-NEXT:    bswapq %rax
 ; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    cmpq %rcx, %rax
 ; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
   ret i32 %m
diff --git a/llvm/test/CodeGen/X86/memcmp-x32.ll b/llvm/test/CodeGen/X86/memcmp-x32.ll
index ad9f2a3..4a3f5a6 100644
--- a/llvm/test/CodeGen/X86/memcmp-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-x32.ll
@@ -221,13 +221,13 @@ define i32 @length4(ptr %X, ptr %Y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    seta %al
-; X86-NEXT:    sbbl $0, %eax
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movsbl %al, %eax
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
   ret i32 %m
diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll
index 8fe1a58..bb089e5 100644
--- a/llvm/test/CodeGen/X86/memcmp.ll
+++ b/llvm/test/CodeGen/X86/memcmp.ll
@@ -205,14 +205,14 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
 define i32 @length4(ptr %X, ptr %Y) nounwind {
 ; X64-LABEL: length4:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    movl (%rsi), %ecx
+; X64-NEXT:    bswapl %eax
 ; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    cmpl %ecx, %eax
 ; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
   ret i32 %m
@@ -260,6 +260,36 @@ define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
   ret i1 %c
 }
 
+define i1 @length4_le(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: length4_le:
+; X64:       # %bb.0:
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    movl (%rsi), %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
+; X64-NEXT:    cmpl %ecx, %eax
+; X64-NEXT:    setbe %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp slt i32 %m, 1
+  ret i1 %c
+}
+
+define i1 @length4_ge(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: length4_ge:
+; X64:       # %bb.0:
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    movl (%rsi), %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
+; X64-NEXT:    cmpl %ecx, %eax
+; X64-NEXT:    setae %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp sgt i32 %m, -1
+  ret i1 %c
+}
+
 define i1 @length4_eq_const(ptr %X) nounwind {
 ; X64-LABEL: length4_eq_const:
 ; X64:       # %bb.0:
@@ -279,13 +309,13 @@ define i32 @length5(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    bswapl %edx
 ; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB18_3
+; X64-NEXT:    jne .LBB20_3
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movzbl 4(%rdi), %eax
 ; X64-NEXT:    movzbl 4(%rsi), %ecx
 ; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB18_3: # %res_block
+; X64-NEXT:  .LBB20_3: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpl %edx, %ecx
 ; X64-NEXT:    sbbl %eax, %eax
@@ -319,7 +349,7 @@ define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    bswapl %edx
 ; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB20_3
+; X64-NEXT:    jne .LBB22_3
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movzbl 4(%rdi), %eax
 ; X64-NEXT:    movzbl 4(%rsi), %ecx
@@ -327,7 +357,7 @@ define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    shrl $31, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB20_3: # %res_block
+; X64-NEXT:  .LBB22_3: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpl %edx, %ecx
 ; X64-NEXT:    sbbl %eax, %eax
@@ -348,7 +378,7 @@ define i32 @length7(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    bswapl %edx
 ; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB21_2
+; X64-NEXT:    jne .LBB23_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movl 3(%rdi), %ecx
 ; X64-NEXT:    movl 3(%rsi), %edx
@@ -356,13 +386,13 @@ define i32 @length7(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapl %edx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    je .LBB21_3
-; X64-NEXT:  .LBB21_2: # %res_block
+; X64-NEXT:    je .LBB23_3
+; X64-NEXT:  .LBB23_2: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpl %edx, %ecx
 ; X64-NEXT:    sbbl %eax, %eax
 ; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB21_3: # %endblock
+; X64-NEXT:  .LBB23_3: # %endblock
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
   ret i32 %m
@@ -376,7 +406,7 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    bswapl %edx
 ; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB22_2
+; X64-NEXT:    jne .LBB24_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movl 3(%rdi), %ecx
 ; X64-NEXT:    movl 3(%rsi), %edx
@@ -384,13 +414,13 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapl %edx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    je .LBB22_3
-; X64-NEXT:  .LBB22_2: # %res_block
+; X64-NEXT:    je .LBB24_3
+; X64-NEXT:  .LBB24_2: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpl %edx, %ecx
 ; X64-NEXT:    sbbl %eax, %eax
 ; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB22_3: # %endblock
+; X64-NEXT:  .LBB24_3: # %endblock
 ; X64-NEXT:    shrl $31, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
@@ -417,14 +447,14 @@ define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
 define i32 @length8(ptr %X, ptr %Y) nounwind {
 ; X64-LABEL: length8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    movq (%rsi), %rcx
+; X64-NEXT:    bswapq %rax
 ; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    cmpq %rcx, %rax
 ; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
   ret i32 %m
@@ -524,7 +554,7 @@ define i32 @length12(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB31_2
+; X64-NEXT:    jne .LBB33_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movl 8(%rdi), %ecx
 ; X64-NEXT:    movl 8(%rsi), %edx
@@ -532,13 +562,13 @@ define i32 @length12(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapl %edx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB31_3
-; X64-NEXT:  .LBB31_2: # %res_block
+; X64-NEXT:    je .LBB33_3
+; X64-NEXT:  .LBB33_2: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    sbbl %eax, %eax
 ; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB31_3: # %endblock
+; X64-NEXT:  .LBB33_3: # %endblock
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
   ret i32 %m
@@ -582,7 +612,7 @@ define i32 @length15(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB34_2
+; X64-NEXT:    jne .LBB36_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movq 7(%rdi), %rcx
 ; X64-NEXT:    movq 7(%rsi), %rdx
@@ -590,13 +620,13 @@ define i32 @length15(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB34_3
-; X64-NEXT:  .LBB34_2: # %res_block
+; X64-NEXT:    je .LBB36_3
+; X64-NEXT:  .LBB36_2: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    sbbl %eax, %eax
 ; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB34_3: # %endblock
+; X64-NEXT:  .LBB36_3: # %endblock
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
   ret i32 %m
@@ -610,7 +640,7 @@ define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB35_2
+; X64-NEXT:    jne .LBB37_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movq 7(%rdi), %rcx
 ; X64-NEXT:    movq 7(%rsi), %rdx
@@ -618,13 +648,13 @@ define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB35_3
-; X64-NEXT:  .LBB35_2: # %res_block
+; X64-NEXT:    je .LBB37_3
+; X64-NEXT:  .LBB37_2: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    sbbl %eax, %eax
 ; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB35_3: # %endblock
+; X64-NEXT:  .LBB37_3: # %endblock
 ; X64-NEXT:    shrl $31, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
@@ -640,20 +670,20 @@ define i32 @length15_const(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    movq (%rdi), %rdx
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    cmpq %rcx, %rdx
-; X64-NEXT:    jne .LBB36_2
+; X64-NEXT:    jne .LBB38_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movabsq $4051322327650219061, %rcx # imm = 0x3839303132333435
 ; X64-NEXT:    movq 7(%rdi), %rdx
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rcx, %rdx
-; X64-NEXT:    je .LBB36_3
-; X64-NEXT:  .LBB36_2: # %res_block
+; X64-NEXT:    je .LBB38_3
+; X64-NEXT:  .LBB38_2: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rcx, %rdx
 ; X64-NEXT:    sbbl %eax, %eax
 ; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB36_3: # %endblock
+; X64-NEXT:  .LBB38_3: # %endblock
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind
   ret i32 %m
@@ -681,20 +711,20 @@ define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    movq (%rdi), %rcx
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    cmpq %rax, %rcx
-; X64-NEXT:    jne .LBB38_2
+; X64-NEXT:    jne .LBB40_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movabsq $4051322327650219061, %rax # imm = 0x3839303132333435
 ; X64-NEXT:    movq 7(%rdi), %rcx
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    cmpq %rax, %rcx
-; X64-NEXT:    je .LBB38_3
-; X64-NEXT:  .LBB38_2: # %res_block
+; X64-NEXT:    je .LBB40_3
+; X64-NEXT:  .LBB40_2: # %res_block
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    cmpq %rax, %rcx
 ; X64-NEXT:    sbbl %edx, %edx
 ; X64-NEXT:    orl $1, %edx
-; X64-NEXT:  .LBB38_3: # %endblock
+; X64-NEXT:  .LBB40_3: # %endblock
 ; X64-NEXT:    testl %edx, %edx
 ; X64-NEXT:    setg %al
 ; X64-NEXT:    retq
@@ -713,7 +743,7 @@ define i32 @length16(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB39_2
+; X64-NEXT:    jne .LBB41_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movq 8(%rdi), %rcx
 ; X64-NEXT:    movq 8(%rsi), %rdx
@@ -721,13 +751,13 @@ define i32 @length16(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB39_3
-; X64-NEXT:  .LBB39_2: # %res_block
+; X64-NEXT:    je .LBB41_3
+; X64-NEXT:  .LBB41_2: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    sbbl %eax, %eax
 ; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB39_3: # %endblock
+; X64-NEXT:  .LBB41_3: # %endblock
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
   ret i32 %m
@@ -783,7 +813,7 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind {
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB41_2
+; X64-NEXT:    jne .LBB43_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movq 8(%rdi), %rcx
 ; X64-NEXT:    movq 8(%rsi), %rdx
@@ -791,13 +821,13 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind {
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB41_3
-; X64-NEXT:  .LBB41_2: # %res_block
+; X64-NEXT:    je .LBB43_3
+; X64-NEXT:  .LBB43_2: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    sbbl %eax, %eax
 ; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB41_3: # %endblock
+; X64-NEXT:  .LBB43_3: # %endblock
 ; X64-NEXT:    shrl $31, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
@@ -814,7 +844,7 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind {
 ; X64-NEXT:    bswapq %rax
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB42_2
+; X64-NEXT:    jne .LBB44_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movq 8(%rdi), %rax
 ; X64-NEXT:    movq 8(%rsi), %rcx
@@ -822,13 +852,13 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind {
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    je .LBB42_3
-; X64-NEXT:  .LBB42_2: # %res_block
+; X64-NEXT:    je .LBB44_3
+; X64-NEXT:  .LBB44_2: # %res_block
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    cmpq %rcx, %rax
 ; X64-NEXT:    sbbl %edx, %edx
 ; X64-NEXT:    orl $1, %edx
-; X64-NEXT:  .LBB42_3: # %endblock
+; X64-NEXT:  .LBB44_3: # %endblock
 ; X64-NEXT:    testl %edx, %edx
 ; X64-NEXT:    setg %al
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll b/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll
index a10fbc1..3dba5eb 100644
--- a/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll
+++ b/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll
@@ -21,10 +21,10 @@ define void @testfn(ptr nocapture %p) {
 
 ; CHECK-LABEL: testfn_scalar
 ; CHECK: retq
-define void @testfn_scalar(ptr nocapture %j) local_unnamed_addr #0 align 2 {
+define void @testfn_scalar(ptr nocapture %j, i1 %arg) local_unnamed_addr #0 align 2 {
 entry:
   %0 = bitcast i64 undef to <2 x float>
-  br i1 undef, label %if.end, label %if.then
+  br i1 %arg, label %if.end, label %if.then
 
 if.then:                                          ; preds = %entry
   unreachable
diff --git a/llvm/test/CodeGen/X86/mingw-refptr.ll b/llvm/test/CodeGen/X86/mingw-refptr.ll
index 73f1a98..82a90ab 100644
--- a/llvm/test/CodeGen/X86/mingw-refptr.ll
+++ b/llvm/test/CodeGen/X86/mingw-refptr.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-w64-mingw32 | FileCheck %s -check-prefix=CHECK-X64
+; RUN: llc < %s -mtriple=x86_64-pc-cygwin | FileCheck %s -check-prefix=CHECK-X64
 ; RUN: llc < %s -mtriple=i686-w64-mingw32 | FileCheck %s -check-prefix=CHECK-X86
 ; RUN: llc < %s -mtriple=i686-w64-mingw32-none-elf | FileCheck %s -check-prefix=CHECK-X86-ELF
 
diff --git a/llvm/test/CodeGen/X86/misched-crash.ll b/llvm/test/CodeGen/X86/misched-crash.ll
index 98818d9..a421faba 100644
--- a/llvm/test/CodeGen/X86/misched-crash.ll
+++ b/llvm/test/CodeGen/X86/misched-crash.ll
@@ -4,7 +4,7 @@ target triple = "x86_64-apple-macosx10"
 
 ; This function contains a cmp instruction with two users.
 ; Hoisting the last use requires trimming the EFLAGS live range to the second.
-define void @rdar13353090(ptr %plane, i64 %_x1, i64 %_x2) {
+define void @rdar13353090(ptr %plane, i64 %_x1, i64 %_x2, i1 %arg) {
 entry:
   %cmp = icmp ult i64 %_x1, %_x2
   %cond = select i1 %cmp, i64 %_x1, i64 %_x2
@@ -33,7 +33,7 @@ for.body34.i:                                     ; preds = %for.inc39.i, %if.th
 
 for.inc39.i:                                      ; preds = %for.body34.i
   %inc41.i = add i64 %index.178.i, 1
-  br i1 undef, label %return, label %for.body34.i
+  br i1 %arg, label %return, label %for.body34.i
 
 return:                                           ; preds = %for.inc39.i, %for.body34.i, %land.lhs.true21, %entry
   ret void
diff --git a/llvm/test/CodeGen/X86/pr10475.ll b/llvm/test/CodeGen/X86/pr10475.ll
index 4dd5aab..4275dc2 100644
--- a/llvm/test/CodeGen/X86/pr10475.ll
+++ b/llvm/test/CodeGen/X86/pr10475.ll
@@ -2,19 +2,19 @@
 
 ; No check in a crash test
 
-define void @autogen_262380_1000() {
+define void @autogen_262380_1000(i1 %arg) {
 BB:
   br label %CF79
 
 CF79:                                             ; preds = %CF79, %BB
-  br i1 undef, label %CF79, label %CF84.critedge.critedge
+  br i1 %arg, label %CF79, label %CF84.critedge.critedge
 
 CF84.critedge.critedge:                           ; preds = %CF79
   %L35 = load <8 x i32>, ptr undef
   br label %CF85
 
 CF85:                                             ; preds = %CF85, %CF84.critedge.critedge
-  br i1 undef, label %CF85, label %CF86
+  br i1 %arg, label %CF85, label %CF86
 
 CF86:                                             ; preds = %CF86, %CF85
   %B61 = sub <8 x i32> %L35, zeroinitializer
@@ -23,7 +23,7 @@ CF86:                                             ; preds = %CF86, %CF85
   br i1 %E73, label %CF86, label %CF87
 
 CF87:                                             ; preds = %CF87, %CF86
-  br i1 undef, label %CF87, label %CF88
+  br i1 %arg, label %CF87, label %CF88
 
 CF88:                                             ; preds = %CF87
   ret void
diff --git a/llvm/test/CodeGen/X86/pr107423.ll b/llvm/test/CodeGen/X86/pr107423.ll
new file mode 100644
index 0000000..d5119d4
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr107423.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=sandybridge | FileCheck %s
+
+define void @PR107423(<64 x i8> %arg, ptr %p0) {
+; CHECK-LABEL: PR107423:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vpsllw $8, %xmm2, %xmm2
+; CHECK-NEXT:    vpsllw $8, %xmm1, %xmm3
+; CHECK-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
+; CHECK-NEXT:    vpaddb %xmm0, %xmm2, %xmm2
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; CHECK-NEXT:    vpaddb %xmm1, %xmm4, %xmm1
+; CHECK-NEXT:    vpaddb %xmm4, %xmm0, %xmm4
+; CHECK-NEXT:    vpsllw $8, %xmm4, %xmm4
+; CHECK-NEXT:    vpaddb %xmm3, %xmm4, %xmm3
+; CHECK-NEXT:    vpsllw $8, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpor %xmm3, %xmm2, %xmm2
+; CHECK-NEXT:    vpsllw $8, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vmovdqu %xmm0, 16(%rdi)
+; CHECK-NEXT:    vmovdqu %xmm2, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %i3 = bitcast <64 x i8> %arg to <32 x i16>
+  %i4 = shufflevector <32 x i16> %i3, <32 x i16> poison, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %i5 = shl <8 x i16> %i4, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %i6 = bitcast <8 x i16> %i5 to <16 x i8>
+  %i7 = shufflevector <64 x i8> %arg, <64 x i8> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+  %i8 = shufflevector <64 x i8> %arg, <64 x i8> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %i9 = shufflevector <64 x i8> %i7, <64 x i8> %i8, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  %i10 = shufflevector <16 x i8> %i6, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %i11 = shufflevector <64 x i8> %i10, <64 x i8> %i9, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+  %i12 = bitcast <64 x i8> %i11 to <32 x i16>
+  %i13 = shl <32 x i16> %i12, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %i14 = bitcast <32 x i16> %i13 to <64 x i8>
+  %i15 = shufflevector <64 x i8> %i14, <64 x i8> poison, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  %i16 = shufflevector <64 x i8> %i11, <64 x i8> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+  %i17 = shufflevector <16 x i8> %i6, <16 x i8> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %i18 = shufflevector <64 x i8> %i16, <64 x i8> %i17, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  %i19 = shufflevector <16 x i8> %i15, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %i20 = shufflevector <64 x i8> %i19, <64 x i8> %i18, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+  %i21 = add <64 x i8> %i20, %i11
+  %i22 = bitcast <64 x i8> %i21 to <32 x i16>
+  %i23 = shl <32 x i16> %i22, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %i24 = bitcast <32 x i16> %i23 to <64 x i8>
+  %i25 = shufflevector <64 x i8> %i24, <64 x i8> poison, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  %i26 = bitcast <32 x i16> %i23 to <64 x i8>
+  %i28 = shufflevector <64 x i8> %i26, <64 x i8> poison, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+  %i32 = shufflevector <64 x i8> %i21, <64 x i8> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %i33 = shufflevector <16 x i8> %i25, <16 x i8> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %i34 = shufflevector <64 x i8> %i32, <64 x i8> %i33, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  %i35 = shufflevector <16 x i8> %i28, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %i36 = shufflevector <64 x i8> %i35, <64 x i8> %i34, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+  %i37 = add <64 x i8> %i36, %i21
+  %i38 = bitcast <64 x i8> %i37 to <32 x i16>
+  %i39 = shufflevector <32 x i16> %i38, <32 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %i40 = shl <8 x i16> %i39, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %i41 = bitcast <8 x i16> %i40 to <16 x i8>
+  %i42 = shufflevector <16 x i8> %i41, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %i43 = shufflevector <64 x i8> %i42, <64 x i8> %i37, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+  %i44 = bitcast <64 x i8> %i43 to <32 x i16>
+  %i45 = shufflevector <32 x i16> %i44, <32 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %i46 = shl <8 x i16> %i45, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %i47 = bitcast <8 x i16> %i46 to <16 x i8>
+  %i48 = shufflevector <16 x i8> %i47, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %i49 = shufflevector <64 x i8> %i43, <64 x i8> %i48, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79>
+  %i50 = shufflevector <64 x i8> %i37, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %i51 = add <32 x i8> %i49, %i50
+  store <32 x i8> %i51, ptr %p0, align 1
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/pr11998.ll b/llvm/test/CodeGen/X86/pr11998.ll
index caaf271..4b93c20 100644
--- a/llvm/test/CodeGen/X86/pr11998.ll
+++ b/llvm/test/CodeGen/X86/pr11998.ll
@@ -1,13 +1,13 @@
 ; RUN: llc < %s -mcpu=corei7-avx -mtriple=x86_64-- -mattr=+avx
 
-define void @autogen_51367_5000(i8) {
+define void @autogen_51367_5000(i8, i1 %arg) {
 BB:
   %B = srem i8 55, %0
   %B9 = shl i8 %B, %B
   br label %CF
 
 CF:                                               ; preds = %CF, %BB
-  br i1 undef, label %CF, label %CF403
+  br i1 %arg, label %CF, label %CF403
 
 CF403:                                            ; preds = %CF403, %CF
   %S44 = icmp eq i8 %B9, %0
diff --git a/llvm/test/CodeGen/X86/pr32108.ll b/llvm/test/CodeGen/X86/pr32108.ll
index 32f8a76..a50b9a6 100644
--- a/llvm/test/CodeGen/X86/pr32108.ll
+++ b/llvm/test/CodeGen/X86/pr32108.ll
@@ -13,7 +13,7 @@ BB:
   br label %CF243
 
 CF243:                                            ; preds = %CF243, %BB
-  br i1 undef, label %CF243, label %CF257
+  br i1 poison, label %CF243, label %CF257
 
 CF257:                                            ; preds = %CF243
   %Shuff144 = shufflevector <4 x i1> undef, <4 x i1> %Cmp45, <4 x i32> <i32 undef, i32 undef, i32 5, i32 undef>
diff --git a/llvm/test/CodeGen/X86/pr50254.ll b/llvm/test/CodeGen/X86/pr50254.ll
index 01d261a..95b7ae5 100644
--- a/llvm/test/CodeGen/X86/pr50254.ll
+++ b/llvm/test/CodeGen/X86/pr50254.ll
@@ -37,7 +37,7 @@ entry:
   br label %for.body
 
 for.body:                                         ; preds = %entry
-  br i1 undef, label %for.end, label %for.body.1
+  br i1 poison, label %for.end, label %for.body.1
 
 for.end:                                          ; preds = %for.body
   store i16 %xor1, ptr @d.e, align 4
diff --git a/llvm/test/CodeGen/X86/pr57673.ll b/llvm/test/CodeGen/X86/pr57673.ll
index 4ca8ae9..779978b 100644
--- a/llvm/test/CodeGen/X86/pr57673.ll
+++ b/llvm/test/CodeGen/X86/pr57673.ll
@@ -100,7 +100,7 @@ bb_entry:
   br label %bb_8
 
 bb_8:                                             ; preds = %bb_last, %bb_entry
-  br i1 undef, label %bb_last, label %bb_mid
+  br i1 poison, label %bb_last, label %bb_mid
 
 bb_mid:                                           ; preds = %bb_8
   %i4 = getelementptr inbounds %t10, ptr %i1, i64 0, i32 1, i64 32
diff --git a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
index beb42f5..47e5079 100644
--- a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
+++ b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
@@ -331,13 +331,13 @@ if.end:
   ]
 
 if.then4:
-  br i1 undef, label %SyTime.exit, label %if.then.i
+  br i1 poison, label %SyTime.exit, label %if.then.i
 
 if.then.i:
   unreachable
 
 SyTime.exit:
-  br i1 undef, label %SyTime.exit2681, label %if.then.i2673
+  br i1 poison, label %SyTime.exit2681, label %if.then.i2673
 
 if.then.i2673:
   unreachable
@@ -349,7 +349,7 @@ land.lhs.true14:
   unreachable
 
 if.end25:
-  br i1 undef, label %SyTime.exit2720, label %if.then.i2712
+  br i1 poison, label %SyTime.exit2720, label %if.then.i2712
 
 if.then.i2712:
   unreachable
@@ -406,7 +406,7 @@ do.end:
   %mul167 = shl i32 %rep.6, 2
   %rep.8 = select i1 %cmp164, i32 %mul167, i32 %rep.6
   %..ch.19 = select i1 false, i32 2, i32 0
-  br i1 undef, label %while.body200, label %while.end1465
+  br i1 poison, label %while.body200, label %while.end1465
 
 while.body200:
   %dec3386.in = phi i32 [ %dec3386, %while.cond197.backedge ], [ %rep.8, %do.end ]
@@ -444,7 +444,7 @@ while.cond1037.preheader:
   br i1 %cmp10393273, label %if.end1070, label %land.rhs1041
 
 while.cond635.preheader:
-  br i1 undef, label %for.body643.us, label %while.cond661
+  br i1 poison, label %for.body643.us, label %while.cond661
 
 for.body643.us:
   br label %for.body643.us
@@ -488,7 +488,7 @@ land.rhs485:
   br i1 %isascii.i.i27763151, label %cond.true.i.i2780, label %cond.false.i.i2782
 
 cond.true.i.i2780:
-  br i1 undef, label %land.lhs.true490, label %lor.rhs500
+  br i1 poison, label %land.lhs.true490, label %lor.rhs500
 
 cond.false.i.i2782:
   unreachable
@@ -499,10 +499,10 @@ land.lhs.true490:
 lor.rhs500:
   ; Make sure spill is hoisted to a cold preheader in outside loop.
   %call3.i.i2792 = call i32 @__maskrune(i32 undef, i64 256)
-  br i1 undef, label %land.lhs.true504, label %do.body479.backedge
+  br i1 poison, label %land.lhs.true504, label %do.body479.backedge
 
 land.lhs.true504:
-  br i1 undef, label %do.body479.backedge, label %if.end517
+  br i1 poison, label %do.body479.backedge, label %if.end517
 
 do.body479.backedge:
   %incdec.ptr480 = getelementptr i8, ptr %incdec.ptr4803316, i64 1
@@ -531,10 +531,10 @@ for.cond534:
   br i1 %cmp536, label %for.cond542.preheader, label %for.cond534
 
 for.cond542.preheader:
-  br i1 undef, label %for.body545, label %for.end552
+  br i1 poison, label %for.body545, label %for.end552
 
 for.body545:
-  br i1 undef, label %for.end552, label %for.body545
+  br i1 poison, label %for.end552, label %for.body545
 
 for.end552:
   %s.2.lcssa = phi ptr [ undef, %for.cond542.preheader ], [ %q.4, %for.body545 ]
@@ -554,7 +554,7 @@ while.cond864:
   br label %while.cond864
 
 sw.bb956:
-  br i1 undef, label %if.then959, label %while.cond197.backedge
+  br i1 poison, label %if.then959, label %while.cond197.backedge
 
 if.then959:
   br label %while.cond962
@@ -600,7 +600,7 @@ while.end1465:
   ]
 
 for.cond1480.preheader:
-  br i1 undef, label %for.body1606.lr.ph, label %for.end1609
+  br i1 poison, label %for.body1606.lr.ph, label %for.end1609
 
 if.then1477:
   %p.1.lcssa3539 = phi ptr [ null, %while.end1465 ], [ null, %while.end1465 ], [ null, %while.end1465 ], [ null, %while.end1465 ], [ %line, %while.body200 ]
@@ -614,7 +614,7 @@ for.body1606.lr.ph:
   br label %for.end1609
 
 for.end1609:
-  br i1 undef, label %for.cond1659.preheader, label %land.lhs.true1614
+  br i1 poison, label %for.cond1659.preheader, label %land.lhs.true1614
 
 land.lhs.true1614:
   br label %for.cond1659.preheader
@@ -631,13 +631,13 @@ while.body1703.lr.ph:
   unreachable
 
 while.cond1683.preheader:
-  br i1 undef, label %while.body1691, label %while.end1693
+  br i1 poison, label %while.body1691, label %while.end1693
 
 while.body1679:
   %oldc.43406 = phi i32 [ %inc, %syEchoch.exit3070 ], [ %oldc.1.lcssa, %for.body1664.lr.ph ]
   %3 = load ptr, ptr %echo.i3101, align 8, !tbaa !6
   %call.i3062 = call i32 @fileno(ptr %3)
-  br i1 undef, label %if.then.i3069, label %syEchoch.exit3070
+  br i1 poison, label %if.then.i3069, label %syEchoch.exit3070
 
 if.then.i3069:
   br label %syEchoch.exit3070
diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll
index c9edd3f..cd3d481 100644
--- a/llvm/test/CodeGen/X86/shift-combine.ll
+++ b/llvm/test/CodeGen/X86/shift-combine.ll
@@ -408,7 +408,7 @@ define dso_local void @PR42880(i32 %t0) {
   %x = ptrtoint ptr %add.ptr.i94 to i32
   %sub2 = sub i32 %x, 0
   %div = sdiv exact i32 %sub2, 24
-  br i1 undef, label %if, label %then
+  br i1 poison, label %if, label %then
 
 then:
   %t1 = xor i32 %div, -1
diff --git a/llvm/test/CodeGen/X86/shuffle-combine-crash.ll b/llvm/test/CodeGen/X86/shuffle-combine-crash.ll
index e10e3dd..962b833 100644
--- a/llvm/test/CodeGen/X86/shuffle-combine-crash.ll
+++ b/llvm/test/CodeGen/X86/shuffle-combine-crash.ll
@@ -28,7 +28,7 @@ define void @sample_test() {
 ; CHECK-NEXT:    movd %xmm0, (%rax)
 ; CHECK-NEXT:  .LBB0_2:
 ; CHECK-NEXT:    retq
-  br i1 undef, label %5, label %1
+  br i1 poison, label %5, label %1
 
 ; <label>:1                                       ; preds = %0
   %2 = load <4 x i8>, ptr undef
@@ -40,4 +40,3 @@ define void @sample_test() {
 ; <label>:5                                       ; preds = %1, %0
   ret void
 }
-
diff --git a/llvm/test/CodeGen/X86/stackmap.ll b/llvm/test/CodeGen/X86/stackmap.ll
index 33180a7..72406aa 100644
--- a/llvm/test/CodeGen/X86/stackmap.ll
+++ b/llvm/test/CodeGen/X86/stackmap.ll
@@ -379,23 +379,23 @@ entry:
 ; CHECK-NEXT:   .short 6
 ; CHECK-NEXT:   .short 0
 ; CHECK-NEXT:   .long
-define void @spillSubReg(i64 %arg) #0 {
+define void @spillSubReg(i64 %arg, i1 %arg2) #0 {
 bb:
-  br i1 undef, label %bb1, label %bb2
+  br i1 %arg2, label %bb1, label %bb2
 
 bb1:
   unreachable
 
 bb2:
   %tmp = load i64, ptr inttoptr (i64 140685446136880 to ptr)
-  br i1 undef, label %bb16, label %bb17
+  br i1 %arg2, label %bb16, label %bb17
 
 bb16:
   unreachable
 
 bb17:
   %tmp32 = trunc i64 %tmp to i32
-  br i1 undef, label %bb60, label %bb61
+  br i1 %arg2, label %bb60, label %bb61
 
 bb60:
   tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
diff --git a/llvm/test/CodeGen/X86/swifterror.ll b/llvm/test/CodeGen/X86/swifterror.ll
index 1489b02..77b1ac0 100644
--- a/llvm/test/CodeGen/X86/swifterror.ll
+++ b/llvm/test/CodeGen/X86/swifterror.ll
@@ -1014,7 +1014,7 @@ define void @swifterror_isel(ptr) {
 ; CHECK-i386-NEXT:    retl
 entry:
   %swifterror = alloca swifterror ptr, align 8
-  br i1 undef, label %5, label %1
+  br i1 poison, label %5, label %1
 
   %2 = phi i16 [ %4, %1 ], [ undef, %entry ]
   %3 = call i1 undef(i16 %2, ptr swiftself %0, ptr nocapture swifterror %swifterror)
diff --git a/llvm/test/CodeGen/X86/switch.ll b/llvm/test/CodeGen/X86/switch.ll
index 629ba48..c75819c 100644
--- a/llvm/test/CodeGen/X86/switch.ll
+++ b/llvm/test/CodeGen/X86/switch.ll
@@ -2563,7 +2563,7 @@ define i32 @pr27135(i32 %i) {
 ; NOOPT-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; NOOPT-NEXT:    retq
 entry:
-  br i1 undef, label %sw, label %end
+  br i1 poison, label %sw, label %end
 sw:
   switch i32 %i, label %end [
     i32 99,  label %sw.bb
diff --git a/llvm/test/CodeGen/X86/tail-merge-unreachable.ll b/llvm/test/CodeGen/X86/tail-merge-unreachable.ll
index ce5613f..9afdabd 100644
--- a/llvm/test/CodeGen/X86/tail-merge-unreachable.ll
+++ b/llvm/test/CodeGen/X86/tail-merge-unreachable.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -mtriple=x86_64-linux-gnu %s -o - -verify-machineinstrs | FileCheck %s
 
-define i32 @tail_merge_unreachable(i32 %i) {
+define i32 @tail_merge_unreachable(i32 %i, i1 %arg) {
 entry:
-  br i1 undef, label %sw, label %end
+  br i1 %arg, label %sw, label %end
 sw:
   switch i32 %i, label %end [
     i32 99,  label %sw.bb
diff --git a/llvm/test/CodeGen/X86/uint_to_half.ll b/llvm/test/CodeGen/X86/uint_to_half.ll
new file mode 100644
index 0000000..b62a07e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/uint_to_half.ll
@@ -0,0 +1,198 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx,+f16c | FileCheck %s -check-prefixes=AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+f16c | FileCheck %s -check-prefixes=AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s -check-prefixes=AVX512
+
+define <8 x half> @test_uitofp_v8i32_v8f16(<8 x i32> %a) {
+; AVX1-LABEL: test_uitofp_v8i32_v8f16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_uitofp_v8i32_v8f16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; AVX2-NEXT:    vsubps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_uitofp_v8i32_v8f16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcvtudq2ps %ymm0, %ymm0
+; AVX512-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %vec = uitofp <8 x i32> %a to <8 x half>
+  ret <8 x half> %vec
+}
+
+define <8 x half> @test_strict_uitofp_v8i32_v8f16(<8 x i32> %a) {
+; AVX1-LABEL: test_strict_uitofp_v8i32_v8f16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_strict_uitofp_v8i32_v8f16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; AVX2-NEXT:    vsubps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_strict_uitofp_v8i32_v8f16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcvtudq2ps %ymm0, %ymm0
+; AVX512-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %vec = tail call <8 x half> @llvm.experimental.constrained.uitofp.f16.i32(<8 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <8 x half> %vec
+}
+
+define <16 x half> @test_uitofp_v16i32_v16f16(<16 x i32> %a) {
+; AVX1-LABEL: test_uitofp_v16i32_v16f16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
+; AVX1-NEXT:    vmulps %ymm3, %ymm2, %ymm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpsrld $16, %xmm5, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT:    vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT:    vmulps %ymm3, %ymm2, %ymm2
+; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
+; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vcvtps2ph $4, %ymm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_uitofp_v16i32_v16f16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7],ymm0[8],ymm4[9],ymm0[10],ymm4[11],ymm0[12],ymm4[13],ymm0[14],ymm4[15]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm5 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; AVX2-NEXT:    vsubps %ymm5, %ymm0, %ymm0
+; AVX2-NEXT:    vaddps %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7],ymm1[8],ymm4[9],ymm1[10],ymm4[11],ymm1[12],ymm4[13],ymm1[14],ymm4[15]
+; AVX2-NEXT:    vsubps %ymm5, %ymm1, %ymm1
+; AVX2-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vcvtps2ph $4, %ymm1, %xmm1
+; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_uitofp_v16i32_v16f16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %vec = uitofp <16 x i32> %a to <16 x half>
+  ret <16 x half> %vec
+}
+
+define <16 x half> @test_strict_uitofp_v16i32_v16f16(<16 x i32> %a) {
+; AVX1-LABEL: test_strict_uitofp_v16i32_v16f16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
+; AVX1-NEXT:    vmulps %ymm3, %ymm2, %ymm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpsrld $16, %xmm5, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT:    vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT:    vmulps %ymm3, %ymm2, %ymm2
+; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
+; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vcvtps2ph $4, %ymm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_strict_uitofp_v16i32_v16f16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7],ymm0[8],ymm4[9],ymm0[10],ymm4[11],ymm0[12],ymm4[13],ymm0[14],ymm4[15]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm5 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; AVX2-NEXT:    vsubps %ymm5, %ymm0, %ymm0
+; AVX2-NEXT:    vaddps %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7],ymm1[8],ymm4[9],ymm1[10],ymm4[11],ymm1[12],ymm4[13],ymm1[14],ymm4[15]
+; AVX2-NEXT:    vsubps %ymm5, %ymm1, %ymm1
+; AVX2-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vcvtps2ph $4, %ymm1, %xmm1
+; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_strict_uitofp_v16i32_v16f16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %vec = tail call <16 x half> @llvm.experimental.constrained.uitofp.f16.i32(<16 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <16 x half> %vec
+}
diff --git a/llvm/test/CodeGen/X86/unreachable-loop-sinking.ll b/llvm/test/CodeGen/X86/unreachable-loop-sinking.ll
index d784425..b09e202 100644
--- a/llvm/test/CodeGen/X86/unreachable-loop-sinking.ll
+++ b/llvm/test/CodeGen/X86/unreachable-loop-sinking.ll
@@ -7,9 +7,9 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define double @fn1(ptr %arg, i64 %arg1) {
+define double @fn1(ptr %arg, i64 %arg1, i1 %arg2) {
 Entry:
-  br i1 undef, label %Body, label %Exit
+  br i1 %arg2, label %Body, label %Exit
 
 Exit:                                             ; preds = %Brancher7, %Entry
   ret double undef
diff --git a/llvm/test/CodeGen/X86/update-terminator.mir b/llvm/test/CodeGen/X86/update-terminator.mir
index d26f7975..ff5df9a 100644
--- a/llvm/test/CodeGen/X86/update-terminator.mir
+++ b/llvm/test/CodeGen/X86/update-terminator.mir
@@ -10,14 +10,14 @@
   declare void @dummy3()
 
   ; Function Attrs: nounwind
-  define void @f2() {
-    br i1 undef, label %bb1, label %bb3
+  define void @f2(i1 %arg) {
+    br i1 %arg, label %bb1, label %bb3
 
   bb1:
     call void @dummy1()
     call void @dummy1()
     call void @dummy1()
-    br i1 undef, label %bb2, label %bb2
+    br i1 %arg, label %bb2, label %bb2
 
   bb2:
     call void @dummy2()
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index 460c5fe..78dd2cf 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -517,7 +517,7 @@ define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
 ; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k1
-; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
@@ -647,7 +647,7 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; AVX512-NEXT:    vpaddsb %xmm1, %xmm0, %xmm2
 ; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT:    vpcmpneqb %xmm2, %xmm1, %k1
-; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
@@ -993,7 +993,7 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
 ; AVX512-NEXT:    vpslld $8, %xmm1, %xmm0
 ; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0
 ; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
 ; AVX512-NEXT:    movw %ax, 9(%rdi)
 ; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index d06993d..746c09e 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -522,7 +522,7 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
 ; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm1
 ; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k1
-; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
@@ -652,7 +652,7 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; AVX512-NEXT:    vpsubsb %xmm1, %xmm0, %xmm2
 ; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT:    vpcmpneqb %xmm2, %xmm1, %k1
-; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
@@ -1010,7 +1010,7 @@ define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
 ; AVX512-NEXT:    vpslld $8, %xmm1, %xmm0
 ; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0
 ; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
 ; AVX512-NEXT:    movw %ax, 9(%rdi)
 ; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index bac1180..be7888c 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -604,7 +604,7 @@ define <16 x i32> @uaddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
 ; AVX512-NEXT:    vpcmpltud %zmm0, %zmm1, %k1
-; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
@@ -730,7 +730,7 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT:    vpcmpltub %xmm0, %xmm1, %k1
-; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
@@ -1046,7 +1046,7 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
 ; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm0
 ; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
 ; AVX512-NEXT:    movw %ax, 9(%rdi)
 ; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index ab75ada..ceb1ad1 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -647,7 +647,7 @@ define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm1
 ; AVX512-NEXT:    vpcmpnleud %zmm0, %zmm1, %k1
-; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
@@ -773,7 +773,7 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT:    vpcmpnleub %xmm0, %xmm1, %k1
-; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
@@ -1093,7 +1093,7 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
 ; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm0
 ; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
 ; AVX512-NEXT:    movw %ax, 9(%rdi)
 ; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 4f42d5c..15e287d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -4129,6 +4129,62 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz
   ret <32 x i8> %shuffle
 }
 
+; PR121823
+define <32 x i8> @shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a)  {
+; AVX1-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[1,9,0,3]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,9,0,3,11,2,5,13,4,7,15,6],zero,zero,zero,zero
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[11,2,5,13,4,7,15,6],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,9,0,3,11,2,5,13,4,7,15,6,u,u,u,u,17,25,16,19,27,18,21,29,20,23,31,22,u,u,u,u]
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,2,4,5,6,0,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[1,9,0,3,11,2,5,13,4,7,15,6,u,u,u,u,17,25,16,19,27,18,21,29,20,23,31,22,u,u,u,u]
+; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,1,2,4,5,6,14,15]
+; AVX512VLBW-NEXT:    vpermi2d %ymm2, %ymm1, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX512VLVBMI:       # %bb.0:
+; AVX512VLVBMI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,9,0,3,11,2,5,13,4,7,15,6,17,25,16,19,27,18,21,29,20,23,31,22,56,57,58,59,60,61,62,63]
+; AVX512VLVBMI-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT:    retq
+;
+; XOPAVX1-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[1,9,0,3,11,2,5,13,4,7,15,6],xmm1[1,9,0,3]
+; XOPAVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[11,2,5,13,4,7,15,6],zero,zero,zero,zero,zero,zero,zero,zero
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,9,0,3,11,2,5,13,4,7,15,6,u,u,u,u,17,25,16,19,27,18,21,29,20,23,31,22,u,u,u,u]
+; XOPAVX2-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,2,4,5,6,0,0]
+; XOPAVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; XOPAVX2-NEXT:    retq
+  %r = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 9, i32 0, i32 3, i32 11, i32 2, i32 5, i32 13, i32 4, i32 7, i32 15, i32 6, i32 17, i32 25, i32 16, i32 19, i32 27, i32 18, i32 21, i32 29, i32 20, i32 23, i32 31, i32 22, i32 32, i32 32, i32 32, i32 32, i32 48, i32 48, i32 48, i32 48>
+  ret <32 x i8> %r
+}
+
 define <32 x i8> @shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) {
 ; AVX1-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
 ; AVX1:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index f0b70ae..4125d78 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -190,7 +190,7 @@ define i64 @PR55050() {
 entry:
   %i275 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> undef, <16 x i8> zeroinitializer)
   %i277 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> undef, <16 x i8> zeroinitializer)
-  br i1 undef, label %exit, label %if
+  br i1 poison, label %exit, label %if
 
 if:
   %i298 = bitcast <2 x i64> %i275 to <4 x i32>
diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
index fe7459e..928f29b 100644
--- a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -868,7 +868,7 @@ define void @infiniteloop() {
 ; DISABLE-NEXT:    popq %rbp
 ; DISABLE-NEXT:    retq
 entry:
-  br i1 undef, label %if.then, label %if.end
+  br i1 poison, label %if.then, label %if.end
 
 if.then:
   %ptr = alloca i32, i32 4
@@ -983,7 +983,7 @@ define void @infiniteloop2() {
 ; DISABLE-NEXT:    popq %rbp
 ; DISABLE-NEXT:    retq
 entry:
-  br i1 undef, label %if.then, label %if.end
+  br i1 poison, label %if.then, label %if.end
 
 if.then:
   %ptr = alloca i32, i32 4
@@ -994,7 +994,7 @@ for.body:                                         ; preds = %for.body, %entry
   %call = tail call i32 asm "movl $$1, $0", "=r,~{ebx}"()
   %add = add nsw i32 %call, %sum.03
   store i32 %add, ptr %ptr
-  br i1 undef, label %body1, label %body2
+  br i1 poison, label %body1, label %body2
 
 body1:
   tail call void asm sideeffect "nop", "~{ebx}"()
@@ -1074,10 +1074,10 @@ define void @infiniteloop3() {
 ; DISABLE-NEXT:  LBB12_7: ## %end
 ; DISABLE-NEXT:    retq
 entry:
-  br i1 undef, label %loop2a, label %body
+  br i1 poison, label %loop2a, label %body
 
 body:                                             ; preds = %entry
-  br i1 undef, label %loop2a, label %end
+  br i1 poison, label %loop2a, label %end
 
 loop1:                                            ; preds = %loop2a, %loop2b
   %var.phi = phi ptr [ %next.phi, %loop2b ], [ %var, %loop2a ]
diff --git a/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll b/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll
index 26ad597..82301e4 100644
--- a/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll
+++ b/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll
@@ -4,7 +4,7 @@
 @GLOBAL = addrspace(1) externally_initialized global i32 0, align 4, !dbg !0
 @SHARED = addrspace(3) externally_initialized global i32 undef, align 4, !dbg !6
 
-define void @test(float, ptr, ptr, i32) !dbg !17 {
+define ptx_kernel void @test(float, ptr, ptr, i32) !dbg !17 {
   %5 = alloca float, align 4
   %6 = alloca ptr, align 8
   %7 = alloca ptr, align 8
@@ -38,7 +38,6 @@ define void @test(float, ptr, ptr, i32) !dbg !17 {
 declare void @llvm.dbg.declare(metadata, metadata, metadata)
 
 !llvm.dbg.cu = !{!2}
-!nvvm.annotations = !{!10}
 !llvm.module.flags = !{!11, !12, !13, !14, !15}
 !llvm.ident = !{!16}
 
@@ -52,7 +51,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
 !7 = distinct !DIGlobalVariable(name: "SHARED", scope: !2, file: !8, line: 4, type: !9, isLocal: false, isDefinition: true)
 !8 = !DIFile(filename: "test.cu", directory: "/tmp")
 !9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-!10 = !{ptr @test, !"kernel", i32 1}
 !11 = !{i32 2, !"Dwarf Version", i32 2}
 !12 = !{i32 2, !"Debug Info Version", i32 3}
 !13 = !{i32 1, !"wchar_size", i32 4}
diff --git a/llvm/test/DebugInfo/NVPTX/debug-info.ll b/llvm/test/DebugInfo/NVPTX/debug-info.ll
index 55c81ca..c926229 100644
--- a/llvm/test/DebugInfo/NVPTX/debug-info.ll
+++ b/llvm/test/DebugInfo/NVPTX/debug-info.ll
@@ -59,7 +59,7 @@
 ; CHECK: }
 
 ; Function Attrs: nounwind
-define void @_Z5saxpyifPfS_(i32 %n, float %a, ptr nocapture readonly %x, ptr nocapture %y) local_unnamed_addr #0 !dbg !566 {
+define ptx_kernel void @_Z5saxpyifPfS_(i32 %n, float %a, ptr nocapture readonly %x, ptr nocapture %y) local_unnamed_addr #0 !dbg !566 {
 entry:
   call void @llvm.dbg.value(metadata i32 %n, metadata !570, metadata !DIExpression()), !dbg !575
   call void @llvm.dbg.value(metadata float %a, metadata !571, metadata !DIExpression()), !dbg !576
@@ -8496,7 +8496,6 @@ attributes #2 = { nounwind readnone speculatable }
 attributes #3 = { nounwind }
 
 !llvm.dbg.cu = !{!0}
-!nvvm.annotations = !{!555, !556, !557, !556, !558, !558, !558, !558, !559, !559, !558}
 !llvm.module.flags = !{!560, !561, !562, !563}
 !llvm.ident = !{!564}
 !nvvm.internalize.after.link = !{}
@@ -9057,11 +9056,6 @@ attributes #3 = { nounwind }
 !552 = !DISubprogram(name: "tgammaf", linkageName: "_ZL7tgammaff", scope: !444, file: !444, line: 1592, type: !13, isLocal: true, isDefinition: false, flags: DIFlagPrototyped, isOptimized: true)
 !553 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !5, entity: !554, file: !445, line: 459)
 !554 = !DISubprogram(name: "truncf", linkageName: "_ZL6truncff", scope: !462, file: !462, line: 662, type: !13, isLocal: true, isDefinition: false, flags: DIFlagPrototyped, isOptimized: true)
-!555 = !{ptr @_Z5saxpyifPfS_, !"kernel", i32 1}
-!556 = !{null, !"align", i32 8}
-!557 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
-!558 = !{null, !"align", i32 16}
-!559 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 !560 = !{i32 2, !"Dwarf Version", i32 2}
 !561 = !{i32 2, !"Debug Info Version", i32 3}
 !562 = !{i32 1, !"wchar_size", i32 4}
diff --git a/llvm/test/DebugInfo/X86/dwarf5-debug-names-addr-tu-to-non-tu.ll b/llvm/test/DebugInfo/X86/dwarf5-debug-names-addr-tu-to-non-tu.ll
new file mode 100644
index 0000000..a836b2a
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/dwarf5-debug-names-addr-tu-to-non-tu.ll
@@ -0,0 +1,83 @@
+; RUN: llc -filetype=obj -O0 -generate-type-units -mtriple=x86_64-unknown-linux-gnu < %s \
+; RUN:     | llvm-dwarfdump -debug-info -debug-names - \
+; RUN:     | FileCheck %s
+
+;; Test that an entry in the debug names table gets created for a top level DIE when the creation of TU fails.
+
+;; clang++ -O0 main.cpp -gdwarf-5 -fdebug-types-section -gpubnames -S -emit-llvm -glldb -o main.ll
+;; int foo;
+;; namespace {
+;; struct t1 {};
+;; } // namespace
+;; template <int *> struct t2 {
+;;   t1 v1;
+;; };
+;; struct t3 {
+;;   t2<&foo> v1;
+;; };
+;; t3 v1;
+
+; CHECK: [[OFFSET:0x[0-9a-f]*]]:   DW_TAG_structure_type
+; CHECK: [[OFFSET1:0x[0-9a-f]*]]:   DW_TAG_structure_type
+
+; CHECK:        Bucket 0 [
+; CHECK-NEXT:    Name 1 {
+; CHECK-NEXT:      Hash: {{.+}}
+; CHECK-NEXT:      String: {{.+}} "t3"
+; CHECK-NEXT:      Entry @ {{.+}} {
+; CHECK-NEXT:        Abbrev: 0x1
+; CHECK-NEXT:        Tag: DW_TAG_structure_type
+; CHECK-NEXT:        DW_IDX_die_offset: [[OFFSET]]
+; CHECK-NEXT:        DW_IDX_parent: <parent not indexed>
+
+; CHECK:        Name 5 {
+; CHECK-NEXT:      Hash: {{.+}}
+; CHECK-NEXT:      String: {{.+}} "t2<&foo>"
+; CHECK-NEXT:      Entry @ 0xe1 {
+; CHECK-NEXT:        Abbrev: 0x1
+; CHECK-NEXT:        Tag: DW_TAG_structure_type
+; CHECK-NEXT:        DW_IDX_die_offset: [[OFFSET1]]
+; CHECK-NEXT:        DW_IDX_parent: <parent not indexed>
+
+; ModuleID = 'main.cpp'
+source_filename = "main.cpp"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.t3 = type { i8 }
+
+@foo = dso_local global i32 0, align 4, !dbg !0
+@v1 = dso_local global %struct.t3 zeroinitializer, align 1, !dbg !5
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!20, !21, !22, !23, !24, !25, !26}
+!llvm.ident = !{!27}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "foo", scope: !2, file: !3, line: 1, type: !19, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 20.0.0git (git@github.com:llvm/llvm-project.git ba373096e8ac83a7136fc44bc4e71a7bc53417a6)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, sysroot: "/")
+!3 = !DIFile(filename: "main.cpp", directory: "/StructuredType", checksumkind: CSK_MD5, checksum: "f91f8d905197b1c0309da9526bc4776e")
+!4 = !{!0, !5}
+!5 = !DIGlobalVariableExpression(var: !6, expr: !DIExpression())
+!6 = distinct !DIGlobalVariable(name: "v1", scope: !2, file: !3, line: 11, type: !7, isLocal: false, isDefinition: true)
+!7 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t3", file: !3, line: 8, size: 8, flags: DIFlagTypePassByValue, elements: !8, identifier: "_ZTS2t3")
+!8 = !{!9}
+!9 = !DIDerivedType(tag: DW_TAG_member, name: "v1", scope: !7, file: !3, line: 9, baseType: !10, size: 8)
+!10 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t2<&foo>", file: !3, line: 5, size: 8, flags: DIFlagTypePassByValue, elements: !11, templateParams: !16, identifier: "_ZTS2t2IXadL_Z3fooEEE")
+!11 = !{!12}
+!12 = !DIDerivedType(tag: DW_TAG_member, name: "v1", scope: !10, file: !3, line: 6, baseType: !13, size: 8)
+!13 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t1", scope: !14, file: !3, line: 3, size: 8, flags: DIFlagTypePassByValue, elements: !15)
+!14 = !DINamespace(scope: null)
+!15 = !{}
+!16 = !{!17}
+!17 = !DITemplateValueParameter(type: !18, value: ptr @foo)
+!18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !19, size: 64)
+!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!20 = !{i32 7, !"Dwarf Version", i32 5}
+!21 = !{i32 2, !"Debug Info Version", i32 3}
+!22 = !{i32 1, !"wchar_size", i32 4}
+!23 = !{i32 8, !"PIC Level", i32 2}
+!24 = !{i32 7, !"PIE Level", i32 2}
+!25 = !{i32 7, !"uwtable", i32 2}
+!26 = !{i32 7, !"frame-pointer", i32 2}
+!27 = !{!"clang version 20.0.0git (git@github.com:llvm/llvm-project.git ba373096e8ac83a7136fc44bc4e71a7bc53417a6)"}
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s
index b4f6e04..9296f04 100644
--- a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s
+++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s
@@ -1,16 +1,18 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=armv7-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t_armv7.o %s
 # RUN: llvm-objdump -s --section=.rodata %t_armv7.o | FileCheck --check-prefix=CHECK-OBJ %s
-# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \
-# RUN:              -slab-page-size 4096 %t_armv7.o -debug-only=jitlink 2>&1 \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -slab-address 0x76ff0000 -slab-allocate 10Kb \
+# RUN:              -slab-page-size 4096 %t_armv7.o 2>&1 \
 # RUN:              | FileCheck --check-prefix=CHECK-LG %s
 # RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \
 # RUN:              -slab-page-size 4096 %t_armv7.o -check %s
 
 # RUN: llvm-mc -triple=thumbv7-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t_thumbv7.o %s
 # RUN: llvm-objdump -s --section=.rodata %t_thumbv7.o | FileCheck --check-prefix=CHECK-OBJ %s
-# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \
-# RUN:              -slab-page-size 4096 %t_thumbv7.o -debug-only=jitlink 2>&1 \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -slab-address 0x76ff0000 -slab-allocate 10Kb \
+# RUN:              -slab-page-size 4096 %t_thumbv7.o 2>&1 \
 # RUN:              | FileCheck --check-prefix=CHECK-LG %s
 # RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \
 # RUN:              -slab-page-size 4096 %t_thumbv7.o -check %s
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/ELF_ehframe.s b/llvm/test/ExecutionEngine/JITLink/AArch64/ELF_ehframe.s
index 151a041..b25ffee 100644
--- a/llvm/test/ExecutionEngine/JITLink/AArch64/ELF_ehframe.s
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/ELF_ehframe.s
@@ -1,7 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=aarch64-linux-gnu -filetype=obj -o %t %s
-# RUN: llvm-jitlink -noexec -phony-externals -debug-only=jitlink %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -phony-externals %t 2>&1 | FileCheck %s
 #
 # Check that splitting of eh-frame sections works.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_compact_unwind.s b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_compact_unwind.s
index 20534d5..b2adb85 100644
--- a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_compact_unwind.s
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_compact_unwind.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=arm64-apple-ios -filetype=obj -o %t %s
-# RUN: llvm-jitlink -noexec -debug-only=jitlink %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check that splitting of compact-unwind sections works.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_ehframe.s b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_ehframe.s
index 8d43b0f..4e84518 100644
--- a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_ehframe.s
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_ehframe.s
@@ -1,7 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=arm64-apple-darwin11 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -noexec -phony-externals -debug-only=jitlink %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -phony-externals %t 2>&1 | FileCheck %s
 #
 # Check that splitting of eh-frame sections works.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch32_relocations.s b/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch32_relocations.s
index 23f6acc3..da9f998 100644
--- a/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch32_relocations.s
+++ b/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch32_relocations.s
@@ -103,6 +103,30 @@ test_gotoffset12_external:
     ld.w $a0, $a0, %got_pc_lo12(external_data)
     .size test_gotoffset12_external, .-test_gotoffset12_external
 
+## Check R_LARCH_B16 relocation for compare and branch instructions.
+
+# jitlink-check: decode_operand(test_br16, 2)[17:0] = \
+# jitlink-check:   (test_br16_target - test_br16)[17:0]
+    .globl test_br16, test_br16_target
+    .p2align 2
+test_br16:
+    beq $t1, $t2, %b16(test_br16_target)
+    .skip (1 << 16)
+test_br16_target:
+    .size test_br16, .-test_br16
+
+## Check R_LARCH_B21 relocation for compare and branch instructions.
+
+# jitlink-check: decode_operand(test_br21, 1)[22:0] = \
+# jitlink-check:   (test_br21_target - test_br21)[22:0]
+    .globl test_br21, test_br21_target
+    .p2align 2
+test_br21:
+    beqz $t1, %b21(test_br21_target)
+    .skip (1 << 21)
+test_br21_target:
+    .size test_br21, .-test_br21
+
 
     .globl named_data
     .p2align 4
diff --git a/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_ehframe.s b/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_ehframe.s
index cc54585..806cdcf 100644
--- a/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_ehframe.s
+++ b/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_ehframe.s
@@ -1,7 +1,7 @@
 # REQUIRES: asserts
-# RUN: llvm-mc --triple=loongarch64-linux-gnu --filetype=obj -o %t %s
-# RUN: llvm-jitlink --noexec --phony-externals --debug-only=jitlink %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-mc -triple=loongarch64-linux-gnu -filetype=obj -o %t %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -phony-externals %t 2>&1 | FileCheck %s
 
 ## Check that splitting of eh-frame sections works.
 
diff --git a/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_relocations.s b/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_relocations.s
index f07ac54..a390d1b 100644
--- a/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_relocations.s
+++ b/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_relocations.s
@@ -116,7 +116,6 @@ test_gotoffset12_external:
     ld.d $a0, $a0, %got_pc_lo12(external_data)
     .size test_gotoffset12_external, .-test_gotoffset12_external
 
-
 ## Check R_LARCH_CALL36 relocation of a local function call.
 
 # jitlink-check: decode_operand(local_func_call36, 1)[19:0] = \
@@ -130,6 +129,31 @@ local_func_call36:
     jirl $ra, $ra, 0
     .size local_func_call36, .-local_func_call36
 
+## Check R_LARCH_B16 relocation for compare and branch instructions.
+
+# jitlink-check: decode_operand(test_br16, 2)[17:0] = \
+# jitlink-check:   (test_br16_target - test_br16)[17:0]
+    .globl test_br16, test_br16_target
+    .p2align 2
+test_br16:
+    beq $t1, $t2, %b16(test_br16_target)
+    .skip (1 << 16)
+test_br16_target:
+    .size test_br16, .-test_br16
+
+## Check R_LARCH_B21 relocation for compare and branch instructions.
+
+# jitlink-check: decode_operand(test_br21, 1)[22:0] = \
+# jitlink-check:   (test_br21_target - test_br21)[22:0]
+    .globl test_br21, test_br21_target
+    .p2align 2
+test_br21:
+    beqz $t1, %b21(test_br21_target)
+    .skip (1 << 21)
+test_br21_target:
+    .size test_br21, .-test_br21
+
+
     .globl named_data
     .p2align 4
     .type named_data,@object
diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s
index 480fbb8..2b5c9e3 100644
--- a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s
+++ b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s
@@ -1,15 +1,15 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=riscv32 -mattr=+relax -filetype=obj -o %t.rv32 %s
-# RUN: llvm-jitlink -noexec \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s %t.rv32 \
-# RUN:    2>&1 | FileCheck %s
+# RUN:     -check %s %t.rv32 2>&1 \
+# RUN:     | FileCheck %s
 
 # RUN: llvm-mc -triple=riscv64 -mattr=+relax -filetype=obj -o %t.rv64 %s
-# RUN: llvm-jitlink -noexec \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s %t.rv64 \
-# RUN:    2>&1 | FileCheck %s
+# RUN:     -check %s %t.rv64 2>&1 \
+# RUN:     | FileCheck %s
 
         .text
 
diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s
index e8a2928..3bbfd55 100644
--- a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s
+++ b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s
@@ -1,43 +1,43 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=riscv32 -mattr=+relax,+c -filetype=obj -o %t.rv32 %s
-# RUN: llvm-jitlink -noexec \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s %t.rv32 \
-# RUN:    2>&1 | FileCheck %s
-# RUN: llvm-jitlink -noexec \
+# RUN:     -check %s %t.rv32 2>&1 \
+# RUN:     | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s -check-name=jitlink-check-rv32 %t.rv32 \
-# RUN:     2>&1 | FileCheck -check-prefix=CHECK-RV32 %s
+# RUN:     -check %s -check-name=jitlink-check-rv32 %t.rv32 2>&1 \
+# RUN:     | FileCheck -check-prefix=CHECK-RV32 %s
 
 # RUN: llvm-mc -triple=riscv64 -mattr=+relax,+c -filetype=obj -o %t.rv64 %s
-# RUN: llvm-jitlink -noexec \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s %t.rv64 \
-# RUN:     2>&1 | FileCheck %s
-# RUN: llvm-jitlink -noexec \
+# RUN:     -check %s %t.rv64 2>&1 \
+# RUN:     | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s -check-name=jitlink-check-rv64 %t.rv64 \
-# RUN:     2>&1 | FileCheck -check-prefix=CHECK-RV64 %s
+# RUN:     -check %s -check-name=jitlink-check-rv64 %t.rv64 2>&1 \
+# RUN:     | FileCheck -check-prefix=CHECK-RV64 %s
 
 # RUN: llvm-mc -triple=riscv32 -mattr=+relax,+zca -filetype=obj -o %t.rv32zca %s
-# RUN: llvm-jitlink -noexec \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s %t.rv32zca \
-# RUN:    2>&1 | FileCheck %s
-# RUN: llvm-jitlink -noexec \
+# RUN:     -check %s %t.rv32zca 2>&1 \
+# RUN:     | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s -check-name=jitlink-check-rv32 %t.rv32zca \
-# RUN:     2>&1 | FileCheck -check-prefix=CHECK-RV32 %s
+# RUN:     -check %s -check-name=jitlink-check-rv32 %t.rv32zca 2>&1 \
+# RUN:     | FileCheck -check-prefix=CHECK-RV32 %s
 
 # RUN: llvm-mc -triple=riscv64 -mattr=+relax,+c -filetype=obj -o %t.rv64 %s
-# RUN: llvm-jitlink -noexec \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s %t.rv64 \
-# RUN:     2>&1 | FileCheck %s
-# RUN: llvm-jitlink -noexec \
+# RUN:     -check %s %t.rv64 2>&1 \
+# RUN:     | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s -check-name=jitlink-check-rv64 %t.rv64 \
-# RUN:     2>&1 | FileCheck -check-prefix=CHECK-RV64 %s
+# RUN:     -check %s -check-name=jitlink-check-rv64 %t.rv64 2>&1 \
+# RUN:     | FileCheck -check-prefix=CHECK-RV64 %s
 
         .text
 
diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/riscv_reloc_add.s b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_reloc_add.s
index 13689b6..01f9e7e 100644
--- a/llvm/test/ExecutionEngine/JITLink/RISCV/riscv_reloc_add.s
+++ b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_reloc_add.s
@@ -1,6 +1,8 @@
 # RUN: rm -rf %t && mkdir -p %t
-# RUN: llvm-mc -triple=riscv64 -filetype=obj -o %t/riscv64_reloc_add.o %s
-# RUN: llvm-mc -triple=riscv32 -filetype=obj -o %t/riscv32_reloc_add.o %s
+# RUN: llvm-mc -triple=riscv64 -mattr=+relax -filetype=obj \
+# RUN:     -o %t/riscv64_reloc_add.o %s
+# RUN: llvm-mc -triple=riscv32 -mattr=+relax -filetype=obj \
+# RUN:     -o %t/riscv32_reloc_add.o %s
 # RUN: llvm-jitlink -noexec -check %s %t/riscv64_reloc_add.o \
 # RUN:     -slab-allocate=1Mb -slab-address=0x1000 -slab-page-size=0x1000
 # RUN: llvm-jitlink -noexec -check %s %t/riscv32_reloc_add.o \
diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/anonymous_symbol.s b/llvm/test/ExecutionEngine/JITLink/RISCV/anonymous_symbol.s
index e7114e4..a1badfd 100644
--- a/llvm/test/ExecutionEngine/JITLink/RISCV/anonymous_symbol.s
+++ b/llvm/test/ExecutionEngine/JITLink/RISCV/anonymous_symbol.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=riscv64 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -debug-only=jitlink -num-threads=0 -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Because of the exist of cfi directive, sections like eh_frame section will be emitted
 # in llvm's object code emission phase. Anonymous symbols will also be emitted to indicate
diff --git a/llvm/test/ExecutionEngine/JITLink/ppc64/ELF_ppc64_ehframe.s b/llvm/test/ExecutionEngine/JITLink/ppc64/ELF_ppc64_ehframe.s
index 9e9b340..75f09ff 100644
--- a/llvm/test/ExecutionEngine/JITLink/ppc64/ELF_ppc64_ehframe.s
+++ b/llvm/test/ExecutionEngine/JITLink/ppc64/ELF_ppc64_ehframe.s
@@ -1,10 +1,12 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=powerpc64le-unknown-linux-gnu -filetype=obj -o %t %s
-# RUN: llvm-jitlink -noexec -phony-externals -debug-only=jitlink %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec -phony-externals \
+# RUN:              %t 2>&1 \
+# RUN:              | FileCheck %s
 # RUN: llvm-mc -triple=powerpc64-unknown-linux-gnu -filetype=obj -o %t %s
-# RUN: llvm-jitlink -noexec -phony-externals -debug-only=jitlink %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec -phony-externals \
+# RUN:              %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check that splitting of eh-frame sections works.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/ppc64/external_weak.s b/llvm/test/ExecutionEngine/JITLink/ppc64/external_weak.s
index 0bc9090..7021a27 100644
--- a/llvm/test/ExecutionEngine/JITLink/ppc64/external_weak.s
+++ b/llvm/test/ExecutionEngine/JITLink/ppc64/external_weak.s
@@ -4,8 +4,9 @@
 # RUN:   %t/external_weak.o %S/Inputs/external_weak.s
 # RUN: llvm-mc -triple=powerpc64le-unknown-linux-gnu -filetype=obj -o \
 # RUN:   %t/external_weak_main.o %S/Inputs/external_weak_main.s
-# RUN: llvm-jitlink -noexec -debug-only=jitlink %t/external_weak.o \
-# RUN:   %t/external_weak_main.o 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              %t/external_weak.o %t/external_weak_main.o 2>&1 \
+# RUN:              | FileCheck %s
 # CHECK: Created ELFLinkGraphBuilder for "{{.*}}external_weak_main.o"
 # CHECK: Creating defined graph symbol for ELF symbol "foo"
 # CHECK: External symbols:
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_abs.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_abs.s
index 830a2e0..d69dbbd 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_abs.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_abs.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t
-# RUN: llvm-jitlink --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check absolute symbol is created with a correct value.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_any.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_any.test
index 10f1182..b117674 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_any.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_any.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check a weak symbol is created for a COMDAT symbol with IMAGE_COMDAT_SELECT_ANY selection type.
 #
 # CHECK: Creating graph symbols...
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_associative.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_associative.test
index 7dfb4c7..8915d04 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_associative.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_associative.test
@@ -1,16 +1,19 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check COMDAT associative symbol is emitted as local symbol.
 #
-# CHECK: Creating graph symbols...
-# CHECK:      2: Creating defined graph symbol for COFF symbol ".text" in .text (index: 2)
-# CHECK-NEXT:   0x0 (block + 0x00000000): size: 0x00000001, linkage: strong, scope: local, dead  -   <anonymous symbol>
-# CHECK-NEXT: 4: Exporting COMDAT graph symbol for COFF symbol "func" in section 2
-# CHECK-NEXT:   0x0 (block + 0x00000000): size: 0x00000001, linkage: weak, scope: default, dead  -   func
-# CHECK-NEXT: 5: Creating defined graph symbol for COFF symbol ".xdata" in .xdata (index: 3)
-# CHECK-NEXT:   0x0 (block + 0x00000000): size: 0x00000000, linkage: strong, scope: local, dead  -   .xdata
+# CHECK:       Creating graph symbols...
+# CHECK:         0: Creating defined graph symbol for COFF symbol ".text" in .text (index: 1)
+# CHECK-NEXT:      0x0 (block + 0x00000000): size: 0x00000000, linkage: strong, scope: local, dead  -   .text
+# CHECK-NEXT:    4: Exporting COMDAT graph symbol for COFF symbol "func" in section 2
+# CHECK-NEXT:      0x0 (block + 0x00000000): size: 0x00000000, linkage: weak, scope: default, dead  -   func
+# CHECK-NEXT:    4: Creating defined graph symbol for COFF symbol "func" in .text (index: 2)
+# CHECK-NEXT:      0x0 (block + 0x00000000): size: 0x00000000, linkage: weak, scope: default, dead  -   func
+# CHECK-NEXT:    5: Creating defined graph symbol for COFF symbol ".xdata" in .xdata (index: 3)
+# CHECK-NEXT:      0x0 (block + 0x00000000): size: 0x00000000, linkage: strong, scope: local, dead  -   .xdata
 
 --- !COFF
 header:
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_exact_match.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_exact_match.test
index f757271..76a0ac4 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_exact_match.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_exact_match.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check a weak symbol is created for a COMDAT symbol with IMAGE_COMDAT_SELECT_EXACT_MATCH selection type.
 # Doesn't check the content validation.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_intervene.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_intervene.test
index 11a1825..79f4b15 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_intervene.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_intervene.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check a comdat export is done correctly even if second symbol of comdat sequences appear out of order
 #
 # CHECK: Creating graph symbols...
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_largest.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_largest.test
index 86d809d..dc05297 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_largest.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_largest.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check jitlink handles largest selection type as plain weak symbol.
 #
 # CHECK: Creating graph symbols...
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_noduplicate.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_noduplicate.test
index 53b2c81..0c5313e 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_noduplicate.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_noduplicate.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check a strong symbol is created for a COMDAT symbol with IMAGE_COMDAT_SELECT_NODUPLICATES selection type.
 #
 # CHECK: Creating graph symbols...
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_offset.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_offset.test
index 97467fd..6cd8ff9 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_offset.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_offset.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check a COMDAT symbol with an offset is handled correctly.
 #
 # CHECK: Creating graph symbols...
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_same_size.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_same_size.test
index ef0f84a..e1d955f 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_same_size.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_same_size.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check a weak symbol is created for a COMDAT symbol with IMAGE_COMDAT_SELECT_SAME_SIZE selection type.
 # Doesn't check the size validation.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak.s
index 79ac75f..8fa8ba0 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t
-# RUN: llvm-jitlink --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check a COMDAT any symbol is exported as a weak symbol.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak_plus_strong.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak_plus_strong.s
index 2754855..01aac02 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak_plus_strong.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak_plus_strong.s
@@ -8,7 +8,7 @@
 #
 # RUN: not llvm-jitlink -noexec %t/COFF_main.o %t/COFF_weak_1.o %t/COFF_strong.o \
 # RUN:                  -slab-allocate 64Kb -slab-address 0xfff00000 \
-# RUN:                  -slab-page-size 4096 -show-graph 2>&1 | FileCheck %s
+# RUN:                  -slab-page-size 4096 -show-graphs=".*" 2>&1 | FileCheck %s
 #
 # Check that a combination of comdat any definition and strong definition
 # generate duplicate definition error.
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_common_symbol.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_common_symbol.s
index 2d4ad30..2788a9b 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_common_symbol.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_common_symbol.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t
-# RUN: llvm-jitlink --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check a common symbol is created.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_duplicate_externals.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_duplicate_externals.test
index e929c01..ebce795 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_duplicate_externals.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_duplicate_externals.test
@@ -1,10 +1,10 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec -abs __ImageBase=0xfff00000 \
-# RUN: --debug-only=jitlink \
-# RUN: -slab-allocate 100Kb -slab-address 0xfff00000 -slab-page-size 4096 \
-# RUN: %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -abs __ImageBase=0xfff00000 -slab-allocate 100Kb \
+# RUN:              -slab-address 0xfff00000 -slab-page-size 4096 %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check duplicate undefined external symbols are handled correctly.
 #
 # CHECK: Creating graph symbols...
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_file_debug.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_file_debug.s
index 3980f81..ac1ef2d 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_file_debug.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_file_debug.s
@@ -1,6 +1,8 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t
-# RUN: llvm-jitlink -abs func=0xcafef00d --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -abs func=0xcafef00d %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check a file debug symbol is skipped.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_static_var.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_static_var.s
index 5275c7d..dce0c1e 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_static_var.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_static_var.s
@@ -1,6 +1,8 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t
-# RUN: llvm-jitlink -abs var=0xcafef00d --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -abs var=0xcafef00d  %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check a local symbol is created for a static variable.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_weak_external.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_weak_external.s
index c750d75..d49d561 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_weak_external.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_weak_external.s
@@ -1,6 +1,8 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t
-# RUN: llvm-jitlink -abs var=0xcafef00d --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -abs var=0xcafef00d %t 2>&1 | \
+# RUN:              FileCheck %s
 #
 # Check a default symbol is aliased as a weak external symbol.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_debug_section_lifetime_is_NoAlloc.yaml b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_debug_section_lifetime_is_NoAlloc.yaml
index 0afcda4..09dda47 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_debug_section_lifetime_is_NoAlloc.yaml
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_debug_section_lifetime_is_NoAlloc.yaml
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: yaml2obj -o %t.o %s
-# RUN: llvm-jitlink -debug-only=jitlink -noexec %t.o 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t.o 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check that debug sections get NoAlloc lifetimes.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_basic.s b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_basic.s
index c01ced5..9339f07 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_basic.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_basic.s
@@ -2,8 +2,9 @@
 # UNSUPPORTED: system-windows
 # RUN: llvm-mc -triple=x86_64-unknown-linux -position-independent \
 # RUN:     -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=jitlink -abs bar=0x01 \
-# RUN:     -abs _ZTIi=0x02 -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -abs bar=0x01 -abs _ZTIi=0x02 %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # FIXME: This test should run on windows. Investigate spurious
 # 'note: command had no output on stdout or stderr' errors, then re-enable.
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_large_static_personality_encodings.s b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_large_static_personality_encodings.s
index 64990b5..98fc5f4 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_large_static_personality_encodings.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_large_static_personality_encodings.s
@@ -2,8 +2,9 @@
 # UNSUPPORTED: system-windows
 # RUN: llvm-mc -triple=x86_64-pc-linux-gnu -large-code-model \
 # RUN:   -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=jitlink -noexec -phony-externals %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -phony-externals %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check handling of pointer encodings for personality functions when compiling
 # with `-mcmodel=large -static`.
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s b/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s
index 139ef14..83d71cd 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s
@@ -1,14 +1,15 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=orc -noexec -abs _external_func=0x1 \
-# RUN:   -entry=_foo %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=orc -noexec \
+# RUN:              -abs _external_func=0x1 -entry=_foo %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check that simplification eliminates dependencies on symbols in this unit,
 # and correctly propagates dependencies on symbols outside the unit (including
 # via locally scoped symbols). In this test _baz depends on _foo indirectly via
 # the local symbol _bar. Initially we expect _baz to depend on _foo, and _foo
 # on _external_func, after simplification we expect both to depend on
-# _external_func only.	
+# _external_func only.
 
 # CHECK: In main emitting {{.*}}_foo{{.*}}
 # CHECK-NEXT: Initial dependencies:
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s
new file mode 100644
index 0000000..81ea18f
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s
@@ -0,0 +1,318 @@
+# RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -filetype=obj -o %t.o %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=orc -noexec -debugger-support \
+# RUN:              %t.o 2>&1 \
+# RUN:              | FileCheck %s
+#
+# REQUIRES: asserts && system-darwin
+#
+# Test that source file names can be indentified from DWARF line tables.
+
+# CHECK: Using FileName = "check-dwarf-filename.c" from DWARF line table
+
+	.section	__TEXT,__text,regular,pure_instructions
+	.build_version macos, 15, 0	sdk_version 15, 0
+	.globl	_main                           ## -- Begin function main
+	.p2align	4, 0x90
+_main:                                  ## @main
+Lfunc_begin0:
+	.file	0 "/Users/lhames/Projects/scratch" "check-dwarf-filename.c" md5 0x331a6c7ae0cfcd2896eca60ac6f5703e
+	.loc	0 1 0                           ## check-dwarf-filename.c:1:0
+	.cfi_startproc
+## %bb.0:
+	##DEBUG_VALUE: main:argc <- $edi
+	##DEBUG_VALUE: main:argv <- $rsi
+	pushq	%rbp
+	.cfi_def_cfa_offset 16
+	.cfi_offset %rbp, -16
+	movq	%rsp, %rbp
+	.cfi_def_cfa_register %rbp
+Ltmp0:
+	.loc	0 2 3 prologue_end              ## check-dwarf-filename.c:2:3
+	xorl	%eax, %eax
+	.loc	0 2 3 epilogue_begin is_stmt 0  ## check-dwarf-filename.c:2:3
+	popq	%rbp
+	retq
+Ltmp1:
+Lfunc_end0:
+	.cfi_endproc
+                                        ## -- End function
+	.section	__DWARF,__debug_abbrev,regular,debug
+Lsection_abbrev:
+	.byte	1                               ## Abbreviation Code
+	.byte	17                              ## DW_TAG_compile_unit
+	.byte	1                               ## DW_CHILDREN_yes
+	.byte	37                              ## DW_AT_producer
+	.byte	37                              ## DW_FORM_strx1
+	.byte	19                              ## DW_AT_language
+	.byte	5                               ## DW_FORM_data2
+	.byte	3                               ## DW_AT_name
+	.byte	37                              ## DW_FORM_strx1
+	.ascii	"\202|"                         ## DW_AT_LLVM_sysroot
+	.byte	37                              ## DW_FORM_strx1
+	.ascii	"\357\177"                      ## DW_AT_APPLE_sdk
+	.byte	37                              ## DW_FORM_strx1
+	.byte	114                             ## DW_AT_str_offsets_base
+	.byte	23                              ## DW_FORM_sec_offset
+	.byte	16                              ## DW_AT_stmt_list
+	.byte	23                              ## DW_FORM_sec_offset
+	.byte	27                              ## DW_AT_comp_dir
+	.byte	37                              ## DW_FORM_strx1
+	.ascii	"\341\177"                      ## DW_AT_APPLE_optimized
+	.byte	25                              ## DW_FORM_flag_present
+	.byte	17                              ## DW_AT_low_pc
+	.byte	27                              ## DW_FORM_addrx
+	.byte	18                              ## DW_AT_high_pc
+	.byte	6                               ## DW_FORM_data4
+	.byte	115                             ## DW_AT_addr_base
+	.byte	23                              ## DW_FORM_sec_offset
+	.byte	0                               ## EOM(1)
+	.byte	0                               ## EOM(2)
+	.byte	2                               ## Abbreviation Code
+	.byte	46                              ## DW_TAG_subprogram
+	.byte	1                               ## DW_CHILDREN_yes
+	.byte	17                              ## DW_AT_low_pc
+	.byte	27                              ## DW_FORM_addrx
+	.byte	18                              ## DW_AT_high_pc
+	.byte	6                               ## DW_FORM_data4
+	.byte	64                              ## DW_AT_frame_base
+	.byte	24                              ## DW_FORM_exprloc
+	.byte	122                             ## DW_AT_call_all_calls
+	.byte	25                              ## DW_FORM_flag_present
+	.byte	3                               ## DW_AT_name
+	.byte	37                              ## DW_FORM_strx1
+	.byte	58                              ## DW_AT_decl_file
+	.byte	11                              ## DW_FORM_data1
+	.byte	59                              ## DW_AT_decl_line
+	.byte	11                              ## DW_FORM_data1
+	.byte	39                              ## DW_AT_prototyped
+	.byte	25                              ## DW_FORM_flag_present
+	.byte	73                              ## DW_AT_type
+	.byte	19                              ## DW_FORM_ref4
+	.byte	63                              ## DW_AT_external
+	.byte	25                              ## DW_FORM_flag_present
+	.ascii	"\341\177"                      ## DW_AT_APPLE_optimized
+	.byte	25                              ## DW_FORM_flag_present
+	.byte	0                               ## EOM(1)
+	.byte	0                               ## EOM(2)
+	.byte	3                               ## Abbreviation Code
+	.byte	5                               ## DW_TAG_formal_parameter
+	.byte	0                               ## DW_CHILDREN_no
+	.byte	2                               ## DW_AT_location
+	.byte	24                              ## DW_FORM_exprloc
+	.byte	3                               ## DW_AT_name
+	.byte	37                              ## DW_FORM_strx1
+	.byte	58                              ## DW_AT_decl_file
+	.byte	11                              ## DW_FORM_data1
+	.byte	59                              ## DW_AT_decl_line
+	.byte	11                              ## DW_FORM_data1
+	.byte	73                              ## DW_AT_type
+	.byte	19                              ## DW_FORM_ref4
+	.byte	0                               ## EOM(1)
+	.byte	0                               ## EOM(2)
+	.byte	4                               ## Abbreviation Code
+	.byte	36                              ## DW_TAG_base_type
+	.byte	0                               ## DW_CHILDREN_no
+	.byte	3                               ## DW_AT_name
+	.byte	37                              ## DW_FORM_strx1
+	.byte	62                              ## DW_AT_encoding
+	.byte	11                              ## DW_FORM_data1
+	.byte	11                              ## DW_AT_byte_size
+	.byte	11                              ## DW_FORM_data1
+	.byte	0                               ## EOM(1)
+	.byte	0                               ## EOM(2)
+	.byte	5                               ## Abbreviation Code
+	.byte	15                              ## DW_TAG_pointer_type
+	.byte	0                               ## DW_CHILDREN_no
+	.byte	73                              ## DW_AT_type
+	.byte	19                              ## DW_FORM_ref4
+	.byte	0                               ## EOM(1)
+	.byte	0                               ## EOM(2)
+	.byte	0                               ## EOM(3)
+	.section	__DWARF,__debug_info,regular,debug
+Lsection_info:
+Lcu_begin0:
+.set Lset0, Ldebug_info_end0-Ldebug_info_start0 ## Length of Unit
+	.long	Lset0
+Ldebug_info_start0:
+	.short	5                               ## DWARF version number
+	.byte	1                               ## DWARF Unit Type
+	.byte	8                               ## Address Size (in bytes)
+.set Lset1, Lsection_abbrev-Lsection_abbrev ## Offset Into Abbrev. Section
+	.long	Lset1
+	.byte	1                               ## Abbrev [1] 0xc:0x50 DW_TAG_compile_unit
+	.byte	0                               ## DW_AT_producer
+	.short	29                              ## DW_AT_language
+	.byte	1                               ## DW_AT_name
+	.byte	2                               ## DW_AT_LLVM_sysroot
+	.byte	3                               ## DW_AT_APPLE_sdk
+.set Lset2, Lstr_offsets_base0-Lsection_str_off ## DW_AT_str_offsets_base
+	.long	Lset2
+.set Lset3, Lline_table_start0-Lsection_line ## DW_AT_stmt_list
+	.long	Lset3
+	.byte	4                               ## DW_AT_comp_dir
+                                        ## DW_AT_APPLE_optimized
+	.byte	0                               ## DW_AT_low_pc
+.set Lset4, Lfunc_end0-Lfunc_begin0     ## DW_AT_high_pc
+	.long	Lset4
+.set Lset5, Laddr_table_base0-Lsection_info0 ## DW_AT_addr_base
+	.long	Lset5
+	.byte	2                               ## Abbrev [2] 0x25:0x24 DW_TAG_subprogram
+	.byte	0                               ## DW_AT_low_pc
+.set Lset6, Lfunc_end0-Lfunc_begin0     ## DW_AT_high_pc
+	.long	Lset6
+	.byte	1                               ## DW_AT_frame_base
+	.byte	86
+                                        ## DW_AT_call_all_calls
+	.byte	5                               ## DW_AT_name
+	.byte	0                               ## DW_AT_decl_file
+	.byte	1                               ## DW_AT_decl_line
+                                        ## DW_AT_prototyped
+	.long	73                              ## DW_AT_type
+                                        ## DW_AT_external
+                                        ## DW_AT_APPLE_optimized
+	.byte	3                               ## Abbrev [3] 0x34:0xa DW_TAG_formal_parameter
+	.byte	1                               ## DW_AT_location
+	.byte	85
+	.byte	7                               ## DW_AT_name
+	.byte	0                               ## DW_AT_decl_file
+	.byte	1                               ## DW_AT_decl_line
+	.long	73                              ## DW_AT_type
+	.byte	3                               ## Abbrev [3] 0x3e:0xa DW_TAG_formal_parameter
+	.byte	1                               ## DW_AT_location
+	.byte	84
+	.byte	8                               ## DW_AT_name
+	.byte	0                               ## DW_AT_decl_file
+	.byte	1                               ## DW_AT_decl_line
+	.long	77                              ## DW_AT_type
+	.byte	0                               ## End Of Children Mark
+	.byte	4                               ## Abbrev [4] 0x49:0x4 DW_TAG_base_type
+	.byte	6                               ## DW_AT_name
+	.byte	5                               ## DW_AT_encoding
+	.byte	4                               ## DW_AT_byte_size
+	.byte	5                               ## Abbrev [5] 0x4d:0x5 DW_TAG_pointer_type
+	.long	82                              ## DW_AT_type
+	.byte	5                               ## Abbrev [5] 0x52:0x5 DW_TAG_pointer_type
+	.long	87                              ## DW_AT_type
+	.byte	4                               ## Abbrev [4] 0x57:0x4 DW_TAG_base_type
+	.byte	9                               ## DW_AT_name
+	.byte	6                               ## DW_AT_encoding
+	.byte	1                               ## DW_AT_byte_size
+	.byte	0                               ## End Of Children Mark
+Ldebug_info_end0:
+	.section	__DWARF,__debug_str_offs,regular,debug
+Lsection_str_off:
+	.long	44                              ## Length of String Offsets Set
+	.short	5
+	.short	0
+Lstr_offsets_base0:
+	.section	__DWARF,__debug_str,regular,debug
+Linfo_string:
+	.asciz	"Apple clang version 16.0.0 (clang-1600.0.26.3)" ## string offset=0
+	.asciz	"check-dwarf-filename.c"        ## string offset=47
+	.asciz	"/Library/Developer/CommandLineTools/SDKs/MacOSX15.0.sdk" ## string offset=70
+	.asciz	"MacOSX15.0.sdk"                ## string offset=126
+	.asciz	"/Users/lhames/Projects/scratch" ## string offset=141
+	.asciz	"main"                          ## string offset=172
+	.asciz	"int"                           ## string offset=177
+	.asciz	"argc"                          ## string offset=181
+	.asciz	"argv"                          ## string offset=186
+	.asciz	"char"                          ## string offset=191
+	.section	__DWARF,__debug_str_offs,regular,debug
+	.long	0
+	.long	47
+	.long	70
+	.long	126
+	.long	141
+	.long	172
+	.long	177
+	.long	181
+	.long	186
+	.long	191
+	.section	__DWARF,__debug_addr,regular,debug
+Lsection_info0:
+.set Lset7, Ldebug_addr_end0-Ldebug_addr_start0 ## Length of contribution
+	.long	Lset7
+Ldebug_addr_start0:
+	.short	5                               ## DWARF version number
+	.byte	8                               ## Address size
+	.byte	0                               ## Segment selector size
+Laddr_table_base0:
+	.quad	Lfunc_begin0
+Ldebug_addr_end0:
+	.section	__DWARF,__debug_names,regular,debug
+Ldebug_names_begin:
+.set Lset8, Lnames_end0-Lnames_start0   ## Header: unit length
+	.long	Lset8
+Lnames_start0:
+	.short	5                               ## Header: version
+	.short	0                               ## Header: padding
+	.long	1                               ## Header: compilation unit count
+	.long	0                               ## Header: local type unit count
+	.long	0                               ## Header: foreign type unit count
+	.long	3                               ## Header: bucket count
+	.long	3                               ## Header: name count
+.set Lset9, Lnames_abbrev_end0-Lnames_abbrev_start0 ## Header: abbreviation table size
+	.long	Lset9
+	.long	8                               ## Header: augmentation string size
+	.ascii	"LLVM0700"                      ## Header: augmentation string
+.set Lset10, Lcu_begin0-Lsection_info   ## Compilation unit 0
+	.long	Lset10
+	.long	0                               ## Bucket 0
+	.long	1                               ## Bucket 1
+	.long	2                               ## Bucket 2
+	.long	2090499946                      ## Hash in Bucket 1
+	.long	193495088                       ## Hash in Bucket 2
+	.long	2090147939                      ## Hash in Bucket 2
+	.long	172                             ## String in Bucket 1: main
+	.long	177                             ## String in Bucket 2: int
+	.long	191                             ## String in Bucket 2: char
+.set Lset11, Lnames0-Lnames_entries0    ## Offset in Bucket 1
+	.long	Lset11
+.set Lset12, Lnames1-Lnames_entries0    ## Offset in Bucket 2
+	.long	Lset12
+.set Lset13, Lnames2-Lnames_entries0    ## Offset in Bucket 2
+	.long	Lset13
+Lnames_abbrev_start0:
+	.ascii	"\230."                         ## Abbrev code
+	.byte	46                              ## DW_TAG_subprogram
+	.byte	3                               ## DW_IDX_die_offset
+	.byte	19                              ## DW_FORM_ref4
+	.byte	4                               ## DW_IDX_parent
+	.byte	25                              ## DW_FORM_flag_present
+	.byte	0                               ## End of abbrev
+	.byte	0                               ## End of abbrev
+	.ascii	"\230$"                         ## Abbrev code
+	.byte	36                              ## DW_TAG_base_type
+	.byte	3                               ## DW_IDX_die_offset
+	.byte	19                              ## DW_FORM_ref4
+	.byte	4                               ## DW_IDX_parent
+	.byte	25                              ## DW_FORM_flag_present
+	.byte	0                               ## End of abbrev
+	.byte	0                               ## End of abbrev
+	.byte	0                               ## End of abbrev list
+Lnames_abbrev_end0:
+Lnames_entries0:
+Lnames0:
+L1:
+	.ascii	"\230."                         ## Abbreviation code
+	.long	37                              ## DW_IDX_die_offset
+	.byte	0                               ## DW_IDX_parent
+                                        ## End of list: main
+Lnames1:
+L0:
+	.ascii	"\230$"                         ## Abbreviation code
+	.long	73                              ## DW_IDX_die_offset
+	.byte	0                               ## DW_IDX_parent
+                                        ## End of list: int
+Lnames2:
+L2:
+	.ascii	"\230$"                         ## Abbreviation code
+	.long	87                              ## DW_IDX_die_offset
+	.byte	0                               ## DW_IDX_parent
+                                        ## End of list: char
+	.p2align	2, 0x0
+Lnames_end0:
+.subsections_via_symbols
+	.section	__DWARF,__debug_line,regular,debug
+Lsection_line:
+Lline_table_start0:
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_compact_unwind.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_compact_unwind.s
index e5783141..3852207 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_compact_unwind.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_compact_unwind.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=x86_64-apple-darwin11 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -noexec -debug-only=jitlink %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check that splitting of compact-unwind sections works.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_alignment.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_alignment.s
index 5a8cef5..3859a35 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_alignment.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_alignment.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=x86_64-apple-macos10.9 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Verify that PC-begin candidate symbols have been sorted correctly when adding
 # PC-begin edges for FDEs. In this test both _main and _X are at address zero,
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_splitting.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_splitting.s
index a5baf56..0d68a10 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_splitting.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_splitting.s
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=jitlink -noexec -entry hook %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -entry hook %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Verify that we split C string literals on null-terminators, rather than on
 # symbol boundaries. We expect four dead-stripped symbols: l_str.0, l_str.2,
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_non_subsections_via_symbols.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_non_subsections_via_symbols.s
index e1adb3b..66fcb47 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_non_subsections_via_symbols.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_non_subsections_via_symbols.s
@@ -4,7 +4,8 @@
 #
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 
 # CHECK:        Creating graph symbols...
 # CHECK:          Graphifying regular section __DATA,__data...
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/zero-ptr.ll b/llvm/test/Instrumentation/HWAddressSanitizer/zero-ptr.ll
new file mode 100644
index 0000000..a201174
--- /dev/null
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/zero-ptr.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes=hwasan -S | FileCheck %s
+; RUN: opt < %s -passes=hwasan -hwasan-recover=0 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=ABORT-ZERO-BASED-SHADOW
+
+; This shows that HWASan will emit a memaccess check when dereferencing a null
+; pointer.
+; The output is used as the source for llvm/test/CodeGen/AArch64/hwasan-zero-ptr.ll.
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android10000"
+
+define void @test_store_to_zeroptr() sanitize_hwaddress {
+; CHECK-LABEL: define void @test_store_to_zeroptr
+; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow)
+; CHECK-NEXT:    [[B:%.*]] = inttoptr i64 0 to ptr
+; CHECK-NEXT:    call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[B]], i32 19)
+; CHECK-NEXT:    store i64 42, ptr [[B]], align 8
+; CHECK-NEXT:    ret void
+;
+; ABORT-ZERO-BASED-SHADOW-LABEL: define void @test_store_to_zeroptr
+; ABORT-ZERO-BASED-SHADOW-SAME: () #[[ATTR0:[0-9]+]] {
+; ABORT-ZERO-BASED-SHADOW-NEXT:  entry:
+; ABORT-ZERO-BASED-SHADOW-NEXT:    [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr null)
+; ABORT-ZERO-BASED-SHADOW-NEXT:    [[B:%.*]] = inttoptr i64 0 to ptr
+; ABORT-ZERO-BASED-SHADOW-NEXT:    call void @llvm.hwasan.check.memaccess.shortgranules.fixedshadow(ptr [[B]], i32 19, i64 0)
+; ABORT-ZERO-BASED-SHADOW-NEXT:    store i64 42, ptr [[B]], align 8
+; ABORT-ZERO-BASED-SHADOW-NEXT:    ret void
+;
+entry:
+  %b = inttoptr i64 0 to i64*
+  store i64 42, ptr %b
+  ret void
+}
diff --git a/llvm/test/Instrumentation/TypeSanitizer/access-with-offset.ll b/llvm/test/Instrumentation/TypeSanitizer/access-with-offset.ll
index 78f3816..56cf3f5 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/access-with-offset.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/access-with-offset.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 ;.
 ; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }]
diff --git a/llvm/test/Instrumentation/TypeSanitizer/alloca-only.ll b/llvm/test/Instrumentation/TypeSanitizer/alloca-only.ll
index 1aa47cac..117cd1a 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/alloca-only.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/alloca-only.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/alloca.ll b/llvm/test/Instrumentation/TypeSanitizer/alloca.ll
index 94098bd..ea5adf6 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/alloca.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/alloca.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/anon.ll b/llvm/test/Instrumentation/TypeSanitizer/anon.ll
index ce4f0c1b..37de1b7 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/anon.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/anon.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/basic-nosan.ll b/llvm/test/Instrumentation/TypeSanitizer/basic-nosan.ll
index 9b9522f..8ddc573 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/basic-nosan.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/basic-nosan.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs
 ; Test basic type sanitizer instrumentation.
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/basic.ll b/llvm/test/Instrumentation/TypeSanitizer/basic.ll
index 8873a40..704c188 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/basic.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/basic.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/byval.ll b/llvm/test/Instrumentation/TypeSanitizer/byval.ll
index 23ed1b0..6ae343d 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/byval.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/byval.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs
 ; Test basic type sanitizer instrumentation.
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/globals.ll b/llvm/test/Instrumentation/TypeSanitizer/globals.ll
index 1f57c2a..a73599e 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/globals.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/globals.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals --include-generated-funcs
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/invalid-metadata.ll b/llvm/test/Instrumentation/TypeSanitizer/invalid-metadata.ll
index e7de62e..0c99c0f 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/invalid-metadata.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/invalid-metadata.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals --include-generated-funcs
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 !llvm.tysan.globals = !{!0}
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/memintrinsics.ll b/llvm/test/Instrumentation/TypeSanitizer/memintrinsics.ll
index 26f7c18..65a30bd 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/memintrinsics.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/memintrinsics.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/nosanitize.ll b/llvm/test/Instrumentation/TypeSanitizer/nosanitize.ll
index 7b07a42..c7c153e 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/nosanitize.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/nosanitize.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/sanitize-no-tbaa.ll b/llvm/test/Instrumentation/TypeSanitizer/sanitize-no-tbaa.ll
index 3cb7b83..060f031 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/sanitize-no-tbaa.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/sanitize-no-tbaa.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/swifterror.ll b/llvm/test/Instrumentation/TypeSanitizer/swifterror.ll
index 5711fb4..dc83a02 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/swifterror.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/swifterror.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Linker/Inputs/libdevice-with-wrong-dl.ll b/llvm/test/Linker/Inputs/libdevice-with-wrong-dl.ll
new file mode 100644
index 0000000..bb000ef
--- /dev/null
+++ b/llvm/test/Linker/Inputs/libdevice-with-wrong-dl.ll
@@ -0,0 +1,2 @@
+target triple = "nvptx64-nvidia-gpulibs"
+target datalayout = "e-i64:64-i128:128-v32:32-n16:32:64"
diff --git a/llvm/test/Linker/cuda-libdevice.ll b/llvm/test/Linker/cuda-libdevice.ll
index 484e8339..87136b1 100644
--- a/llvm/test/Linker/cuda-libdevice.ll
+++ b/llvm/test/Linker/cuda-libdevice.ll
@@ -4,8 +4,8 @@
 ; RUN: llvm-as %p/Inputs/libdevice-cuda-9.ll -o %t/libdevice.compute_35.10.bc
 ; RUN: llvm-as %p/Inputs/libdevice-cuda-10.ll -o %t/libdevice.10.bc
 ; RUN: llvm-as %p/Inputs/libdevice-cuda-11.ll -o %t/libdevice.11.10.bc
-; RUN: llvm-as %p/Inputs/libdevice-cuda-9.ll -o %t/correct-libdevice-wrong-filename.bc
 ; RUN: llvm-as %p/Inputs/not-a-libdevice.ll -o %t/libdevice-with-wrong-info.bc
+; RUN: llvm-as %p/Inputs/libdevice-with-wrong-dl.ll -o %t/libdevice-with-wrong-dl.bc
 
 ; No warnings expected when we link with libdevice variants
 ; RUN: llvm-link %t/main.bc %t/libdevice.compute_35.10.bc -S 2>&1 \
@@ -15,12 +15,12 @@
 ; RUN: llvm-link %t/main.bc %t/libdevice.11.10.bc -S 2>&1 \
 ; RUN:  | FileCheck --check-prefixes COMMON,NOWARN %s
 
-; But make sure we still issue warnings if we see unexpected filename, or
-; unexpected triple or datalayout within a libdevice filename.
-; RUN: llvm-link %t/main.bc %t/correct-libdevice-wrong-filename.bc -S 2>&1 \
-; RUN:  | FileCheck --check-prefixes COMMON,WARN-TRIPLE %s
+; But make sure we still issue warnings if we see unexpected triple or
+; datalayout within a libdevice module.
 ; RUN: llvm-link %t/main.bc %t/libdevice-with-wrong-info.bc -S 2>&1 \
 ; RUN:  | FileCheck --check-prefixes COMMON,WARN-TRIPLE,WARN-DL %s
+; RUN: llvm-link %t/main.bc %t/libdevice-with-wrong-dl.bc -S 2>&1 \
+; RUN:  | FileCheck --check-prefixes COMMON,NOWARN,WARN-DL %s
 
 
 target triple = "nvptx64-nvidia-cuda"
diff --git a/llvm/test/MC/AArch64/armv9.6a-rme-gpc3.s b/llvm/test/MC/AArch64/armv9.6a-rme-gpc3.s
index baf05f1..093101b 100644
--- a/llvm/test/MC/AArch64/armv9.6a-rme-gpc3.s
+++ b/llvm/test/MC/AArch64/armv9.6a-rme-gpc3.s
@@ -2,10 +2,18 @@
 // RUN: llvm-mc -triple aarch64 -show-encoding %s  | FileCheck %s
 .func:
   apas x0
+  apas x1
+  apas x2
+  apas x17
+  apas x30
   mrs x3, GPCBW_EL3
   msr GPCBW_EL3, x4
 
 # CHECK:      .func:
-# CHECK-NEXT: 	apas	x0                              // encoding: [0x1f,0x70,0x0e,0xd5]
+# CHECK-NEXT:	apas    x0                              // encoding: [0x00,0x70,0x0e,0xd5]
+# CHECK-NEXT:	apas    x1                              // encoding: [0x01,0x70,0x0e,0xd5]
+# CHECK-NEXT:	apas    x2                              // encoding: [0x02,0x70,0x0e,0xd5]
+# CHECK-NEXT:	apas    x17                             // encoding: [0x11,0x70,0x0e,0xd5]
+# CHECK-NEXT:	apas    x30                             // encoding: [0x1e,0x70,0x0e,0xd5]
 # CHECK-NEXT: 	mrs	x3, GPCBW_EL3                   // encoding: [0xa3,0x21,0x3e,0xd5]
 # CHECK-NEXT: 	msr	GPCBW_EL3, x4                   // encoding: [0xa4,0x21,0x1e,0xd5]
diff --git a/llvm/test/MC/AMDGPU/gfx1030_err.s b/llvm/test/MC/AMDGPU/gfx1030_err.s
index 87a0987..a0565dc 100644
--- a/llvm/test/MC/AMDGPU/gfx1030_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1030_err.s
@@ -573,3 +573,9 @@ v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
 
 v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
 // GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+image_bvh_intersect_ray v[4:7], v[9:19], null
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_bvh64_intersect_ray v[4:7], v[9:20], null
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s b/llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s
index bd61ad3..f6ea86e 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s
@@ -359,3 +359,130 @@ image_sample_c_d_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_
 
 image_load v[0:1], v0, s[0:7] dmask:0x9 dim:1 D
 // NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid dim value
+
+// null is not allowed as SRSRC or SSAMP
+image_atomic_add v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_and v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_cmpswap v[0:1], v[10:11], null dmask:0x3 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_dec v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_fcmpswap v[1:2], v[2:3], null dmask:0x3 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_fmax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_fmin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_inc v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_or v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_smax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_smin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_sub v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_swap v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_umax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_umin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_xor v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4 v[64:67], v32, null, s[4:11], dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4 v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_b v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_b v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_c v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_c v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4h v[64:67], v32, null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4h v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_l v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_l v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_o v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_o v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_load v[4:7], v0, null dmask:0xf dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_store v[0:3], v[254:255], null dmask:0xf dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample v[5:6], v1, null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample v[5:6], v1, s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_b v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_b v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_c v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_c v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_d v[5:6], v[1:3], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_d v[5:6], v[1:3], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_l v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_l v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_o v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_o v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_mtbuf_err.s b/llvm/test/MC/AMDGPU/gfx10_asm_mtbuf_err.s
new file mode 100644
index 0000000..5eb2e9c
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_mtbuf_err.s
@@ -0,0 +1,49 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOGFX10 --implicit-check-not=error: %s
+
+tbuffer_load_format_d16_x v3, v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_d16_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_x v3, v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_mubuf_err.s b/llvm/test/MC/AMDGPU/gfx10_asm_mubuf_err.s
new file mode 100644
index 0000000..bd7acfe
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_mubuf_err.s
@@ -0,0 +1,160 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOGFX10 --implicit-check-not=error: %s
+
+buffer_atomic_add v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_add_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_and v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_and_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_cmpswap v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_cmpswap_x2 v[5:8], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_dec v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_dec_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_inc v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_inc_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_or v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_or_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_smax v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_smax_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_smin v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_smin_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_sub v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_sub_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_swap v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_swap_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_umax v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_umax_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_umin v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_umin_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_xor v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_xor_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_d16_x v3, v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_dword v5, v0, null, s3 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_dwordx2 v[5:6], v0, null, s3 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_dwordx3 v[5:7], v0, null, s3 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_dwordx4 v[5:8], v0, null, s3 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_sbyte v5, v0, null, s3 idxen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_sshort v5, v0, null, s3 idxen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_ubyte v5, v0, null, s3 idxen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_ushort v5, v0, null, s3 idxen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_byte v1, v0, null, s4 idxen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_dword v1, v0, null, s4 idxen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_dwordx2 v[1:2], v0, null, s4 idxen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_dwordx3 v[1:3], v0, null, s4 idxen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_dwordx4 v[1:4], v0, null, s4 idxen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_d16_hi_x v1, v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_d16_x v1, v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_d16_xy v1, v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_d16_xyz v[1:2], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_d16_xyzw v[1:3], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_x v1, v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_xy v[1:2], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_xyz v[1:3], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_xyzw v[1:4], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_smem.s b/llvm/test/MC/AMDGPU/gfx10_asm_smem.s
index b582de8..683a019 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_smem.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_smem.s
@@ -281,6 +281,22 @@ s_load_dwordx16 s[20:35], s[2:3], 0x1234 glc dlc
 s_load_dwordx16 s[20:35], s[2:3], s0 offset:0x12345 glc dlc
 // GFX10: encoding: [0x01,0x45,0x11,0xf4,0x45,0x23,0x01,0x00]
 
+// null as dst
+s_load_dword null, s[2:3], s0
+// GFX10: encoding: [0x41,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_dwordx2 null, s[2:3], s0
+// GFX10: encoding: [0x41,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_dwordx4 null, s[2:3], s0
+// GFX10: encoding: [0x41,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_dwordx8 null, s[2:3], s0
+// GFX10: encoding: [0x41,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_dwordx16 null, s[2:3], s0
+// GFX10: encoding: [0x41,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00]
+
 s_buffer_load_dword s5, s[4:7], s0
 // GFX10: encoding: [0x42,0x01,0x20,0xf4,0x00,0x00,0x00,0x00]
 
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_smem_err.s b/llvm/test/MC/AMDGPU/gfx10_asm_smem_err.s
new file mode 100644
index 0000000..74c283c
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_smem_err.s
@@ -0,0 +1,88 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOGFX10 --implicit-check-not=error: %s
+
+s_buffer_atomic_add s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_add_x2 s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_and s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_cmpswap s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_cmpswap_x2 s[4:7], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_dec s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_dec_x2 s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_inc s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_inc_x2 s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_or s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_or_x2 s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_smax s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_smax_x2 s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_smin s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_smin_x2 s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_sub s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_sub_x2 s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_swap s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_umax s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_umax_x2 s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_umin s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_umin_x2 s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dword s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx2 s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx4 s[4:7], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx8 s[4:11], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx16 s[4:19], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_store_dword s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_atc_probe_buffer 7, null, s2
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s
index 9bf72a1..2586198 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s
@@ -400,3 +400,126 @@ image_store_pck v1, v[2:3], s[12:19] dmask:0x1 unorm
 image_store_mip_pck v1, v[2:3], s[12:19] dmask:0x0 unorm
 // NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
 
+// null is not allowed as SRSRC or SSAMP
+image_atomic_add v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_and v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_cmpswap v[0:1], v[10:11], null dmask:0x3 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_dec v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_inc v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_or v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_smax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_smin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_sub v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_swap v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_umax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_umin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_xor v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4 v[64:67], v32, null, s[4:11], dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4 v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_b v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_b v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_c v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_c v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4h v[64:67], v32, null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4h v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_l v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_l v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_o v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_o v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_load v[4:7], v0, null dmask:0xf dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_store v[0:3], v[254:255], null dmask:0xf dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample v[5:6], v1, null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample v[5:6], v1, s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_b v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_b v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_c v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_c v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_d v[5:6], v[1:3], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_d v[5:6], v[1:3], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_l v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_l v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_o v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_o v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_bvh_intersect_ray v[4:7], v[9:19], null
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_bvh64_intersect_ray v[4:7], v[9:20], null
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_err.s
new file mode 100644
index 0000000..3b69835
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_err.s
@@ -0,0 +1,49 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --check-prefixes=NOGFX11 --implicit-check-not=error: %s
+
+tbuffer_load_format_d16_x v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_d16_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_x v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_err.s
new file mode 100644
index 0000000..d3d74467
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_err.s
@@ -0,0 +1,229 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --check-prefixes=NOGFX11 --implicit-check-not=error: %s
+
+buffer_atomic_add_f32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_add_u32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_add_u64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_and_b32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_and_b64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_cmpswap_b32 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_cmpswap_b64 v[5:8], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_cmpswap_f32 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_csub_u32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_dec_u32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_dec_u64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_inc_u32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_inc_u64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_max_f32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_max_i32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_max_i64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_max_u32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_max_u64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_min_f32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_min_i32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_min_i64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_min_u32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_min_u64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_or_b32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_or_b64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_sub_u32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_sub_u64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_swap_b32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_swap_b64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_xor_b32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_xor_b64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_b128 v[5:8], v0, null, s3 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_b32 v5, v0, null, s3 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b64 v[1:2], v0, null, s4 idxen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b96 v[1:3], v0, null, s4 idxen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_b16 v5, v0, null, s3 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_format_xy v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_format_xyz v[3:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_format_xyzw v[3:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_hi_b16 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_hi_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_hi_i8 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_hi_u8 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_i8 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_u8 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_i16 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_i8 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_lds_b32 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_lds_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_lds_i16 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_lds_i8 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_lds_u16 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_lds_u8 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_u16 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_u8 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b16 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b32 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b64 v[3:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b8 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b96 v[3:5], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_format_xy v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_format_xyz v[3:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_format_xyzw v[3:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_hi_b16 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_hi_b8 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_hi_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_x v1, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_xy v[1:2], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_xyz v[1:3], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_xyzw v[1:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_smem.s b/llvm/test/MC/AMDGPU/gfx11_asm_smem.s
index 1d6b947..e071c67 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_smem.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_smem.s
@@ -239,6 +239,22 @@ s_load_b512 s[20:35], s[2:3], s0 glc dlc
 s_load_b512 s[20:35], s[2:3], 0x1234 glc dlc
 // GFX11: encoding: [0x01,0x65,0x10,0xf4,0x34,0x12,0x00,0xf8]
 
+// null as dst
+s_load_b32 null, s[2:3], s0
+// GFX11: encoding: [0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b64 null, s[2:3], s0
+// GFX11: encoding: [0x01,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b128 null, s[2:3], s0
+// GFX11: encoding: [0x01,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b256 null, s[2:3], s0
+// GFX11: encoding: [0x01,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b512 null, s[2:3], s0
+// GFX11: encoding: [0x01,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00]
+
 s_buffer_load_b32 s5, s[4:7], s0
 // GFX11: encoding: [0x42,0x01,0x20,0xf4,0x00,0x00,0x00,0x00]
 
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_smem_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_smem_err.s
new file mode 100644
index 0000000..7dd6ded
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_smem_err.s
@@ -0,0 +1,34 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --check-prefixes=NOGFX11 --implicit-check-not=error: %s
+
+s_buffer_load_b32 s4, null, s101
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_b64 s4, null, s101
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_b128 s4, null, s101
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_b256 s4, null, s101
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_b512 s4, null, s101
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dword s4, null, s101
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx2 s[4:5], null, s101
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx4 s[4:7], null, s101
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx8 s[4:11], null, s101
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx16 s[4:19], null, s101
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_atc_probe_buffer 7, null, s2
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
index 4e4dc66..1aefd1f 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
@@ -269,50 +269,65 @@ v_clz_i32_u32 v5, src_scc
 v_clz_i32_u32 v255, 0xaf123456
 // GFX11: v_clz_i32_u32_e32 v255, 0xaf123456      ; encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
-v_cos_f16 v5, v1
-// GFX11: v_cos_f16_e32 v5, v1                    ; encoding: [0x01,0xc3,0x0a,0x7e]
+v_cos_f16 v5.l, v1.l
+// GFX11: v_cos_f16_e32 v5.l, v1.l                ; encoding: [0x01,0xc3,0x0a,0x7e]
 
-v_cos_f16 v5, v127
-// GFX11: v_cos_f16_e32 v5, v127                  ; encoding: [0x7f,0xc3,0x0a,0x7e]
+v_cos_f16 v5.l, v127.l
+// GFX11: v_cos_f16_e32 v5.l, v127.l              ; encoding: [0x7f,0xc3,0x0a,0x7e]
 
-v_cos_f16 v5, s1
-// GFX11: v_cos_f16_e32 v5, s1                    ; encoding: [0x01,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, s1
+// GFX11: v_cos_f16_e32 v5.l, s1                  ; encoding: [0x01,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, s105
-// GFX11: v_cos_f16_e32 v5, s105                  ; encoding: [0x69,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, s105
+// GFX11: v_cos_f16_e32 v5.l, s105                ; encoding: [0x69,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, vcc_lo
-// GFX11: v_cos_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, vcc_lo
+// GFX11: v_cos_f16_e32 v5.l, vcc_lo              ; encoding: [0x6a,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, vcc_hi
-// GFX11: v_cos_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, vcc_hi
+// GFX11: v_cos_f16_e32 v5.l, vcc_hi              ; encoding: [0x6b,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, ttmp15
-// GFX11: v_cos_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, ttmp15
+// GFX11: v_cos_f16_e32 v5.l, ttmp15              ; encoding: [0x7b,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, m0
-// GFX11: v_cos_f16_e32 v5, m0                    ; encoding: [0x7d,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, m0
+// GFX11: v_cos_f16_e32 v5.l, m0                  ; encoding: [0x7d,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, exec_lo
-// GFX11: v_cos_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, exec_lo
+// GFX11: v_cos_f16_e32 v5.l, exec_lo             ; encoding: [0x7e,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, exec_hi
-// GFX11: v_cos_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, exec_hi
+// GFX11: v_cos_f16_e32 v5.l, exec_hi             ; encoding: [0x7f,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, null
-// GFX11: v_cos_f16_e32 v5, null                  ; encoding: [0x7c,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, null
+// GFX11: v_cos_f16_e32 v5.l, null                ; encoding: [0x7c,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, -1
-// GFX11: v_cos_f16_e32 v5, -1                    ; encoding: [0xc1,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, -1
+// GFX11: v_cos_f16_e32 v5.l, -1                  ; encoding: [0xc1,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, 0.5
-// GFX11: v_cos_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, 0.5
+// GFX11: v_cos_f16_e32 v5.l, 0.5                 ; encoding: [0xf0,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, src_scc
-// GFX11: v_cos_f16_e32 v5, src_scc               ; encoding: [0xfd,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, src_scc
+// GFX11: v_cos_f16_e32 v5.l, src_scc             ; encoding: [0xfd,0xc2,0x0a,0x7e]
 
-v_cos_f16 v127, 0xfe0b
-// GFX11: v_cos_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_cos_f16 v127.l, 0xfe0b
+// GFX11: v_cos_f16_e32 v127.l, 0xfe0b            ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cos_f16 v5.l, v1.h
+// GFX11: v_cos_f16_e32 v5.l, v1.h                ; encoding: [0x81,0xc3,0x0a,0x7e]
+
+v_cos_f16 v5.l, v127.h
+// GFX11: v_cos_f16_e32 v5.l, v127.h              ; encoding: [0xff,0xc3,0x0a,0x7e]
+
+v_cos_f16 v127.l, 0.5
+// GFX11: v_cos_f16_e32 v127.l, 0.5               ; encoding: [0xf0,0xc2,0xfe,0x7e]
+
+v_cos_f16 v5.h, src_scc
+// GFX11: v_cos_f16_e32 v5.h, src_scc             ; encoding: [0xfd,0xc2,0x0a,0x7f]
+
+v_cos_f16 v127.h, 0xfe0b
+// GFX11: v_cos_f16_e32 v127.h, 0xfe0b            ; encoding: [0xff,0xc2,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_cos_f32 v5, v1
 // GFX11: v_cos_f32_e32 v5, v1                    ; encoding: [0x01,0x6d,0x0a,0x7e]
@@ -1271,11 +1286,11 @@ v_cvt_i32_f64 v5, src_scc
 v_cvt_i32_f64 v255, 0xaf123456
 // GFX11: v_cvt_i32_f64_e32 v255, 0xaf123456      ; encoding: [0xff,0x06,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
-v_cvt_i32_i16 v5, v1
-// GFX11: v_cvt_i32_i16_e32 v5, v1                ; encoding: [0x01,0xd5,0x0a,0x7e]
+v_cvt_i32_i16 v5, v1.l
+// GFX11: v_cvt_i32_i16_e32 v5, v1.l              ; encoding: [0x01,0xd5,0x0a,0x7e]
 
-v_cvt_i32_i16 v5, v127
-// GFX11: v_cvt_i32_i16_e32 v5, v127              ; encoding: [0x7f,0xd5,0x0a,0x7e]
+v_cvt_i32_i16 v5, v127.l
+// GFX11: v_cvt_i32_i16_e32 v5, v127.l            ; encoding: [0x7f,0xd5,0x0a,0x7e]
 
 v_cvt_i32_i16 v5, s1
 // GFX11: v_cvt_i32_i16_e32 v5, s1                ; encoding: [0x01,0xd4,0x0a,0x7e]
@@ -1316,6 +1331,12 @@ v_cvt_i32_i16 v5, src_scc
 v_cvt_i32_i16 v255, 0xfe0b
 // GFX11: v_cvt_i32_i16_e32 v255, 0xfe0b          ; encoding: [0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
+v_cvt_i32_i16 v5, v1.h
+// GFX11: v_cvt_i32_i16_e32 v5, v1.h              ; encoding: [0x81,0xd5,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, v127.h
+// GFX11: v_cvt_i32_i16_e32 v5, v127.h            ; encoding: [0xff,0xd5,0x0a,0x7e]
+
 v_cvt_nearest_i32_f32 v5, v1
 // GFX11: v_cvt_nearest_i32_f32_e32 v5, v1        ; encoding: [0x01,0x19,0x0a,0x7e]
 
@@ -1685,11 +1706,11 @@ v_cvt_u32_f64 v5, src_scc
 v_cvt_u32_f64 v255, 0xaf123456
 // GFX11: v_cvt_u32_f64_e32 v255, 0xaf123456      ; encoding: [0xff,0x2a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
-v_cvt_u32_u16 v5, v1
-// GFX11: v_cvt_u32_u16_e32 v5, v1                ; encoding: [0x01,0xd7,0x0a,0x7e]
+v_cvt_u32_u16 v5, v1.l
+// GFX11: v_cvt_u32_u16_e32 v5, v1.l              ; encoding: [0x01,0xd7,0x0a,0x7e]
 
-v_cvt_u32_u16 v5, v127
-// GFX11: v_cvt_u32_u16_e32 v5, v127              ; encoding: [0x7f,0xd7,0x0a,0x7e]
+v_cvt_u32_u16 v5, v127.l
+// GFX11: v_cvt_u32_u16_e32 v5, v127.l            ; encoding: [0x7f,0xd7,0x0a,0x7e]
 
 v_cvt_u32_u16 v5, s1
 // GFX11: v_cvt_u32_u16_e32 v5, s1                ; encoding: [0x01,0xd6,0x0a,0x7e]
@@ -1730,6 +1751,12 @@ v_cvt_u32_u16 v5, src_scc
 v_cvt_u32_u16 v255, 0xfe0b
 // GFX11: v_cvt_u32_u16_e32 v255, 0xfe0b          ; encoding: [0xff,0xd6,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
+v_cvt_u32_u16 v5, v1.h
+// GFX11: v_cvt_u32_u16_e32 v5, v1.h              ; encoding: [0x81,0xd7,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, v127.h
+// GFX11: v_cvt_u32_u16_e32 v5, v127.h            ; encoding: [0xff,0xd7,0x0a,0x7e]
+
 v_exp_f16 v5.l, v1.l
 // GFX11: v_exp_f16_e32 v5.l, v1.l                ; encoding: [0x01,0xb1,0x0a,0x7e]
 
@@ -2093,50 +2120,65 @@ v_floor_f64 v[5:6], src_scc
 v_floor_f64 v[254:255], 0xaf123456
 // GFX11: v_floor_f64_e32 v[254:255], 0xaf123456  ; encoding: [0xff,0x34,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
-v_fract_f16 v5, v1
-// GFX11: v_fract_f16_e32 v5, v1                  ; encoding: [0x01,0xbf,0x0a,0x7e]
+v_fract_f16 v5.l, v1.l
+// GFX11: v_fract_f16_e32 v5.l, v1.l              ; encoding: [0x01,0xbf,0x0a,0x7e]
+
+v_fract_f16 v5.l, v127.l
+// GFX11: v_fract_f16_e32 v5.l, v127.l            ; encoding: [0x7f,0xbf,0x0a,0x7e]
+
+v_fract_f16 v5.l, s1
+// GFX11: v_fract_f16_e32 v5.l, s1                ; encoding: [0x01,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, v127
-// GFX11: v_fract_f16_e32 v5, v127                ; encoding: [0x7f,0xbf,0x0a,0x7e]
+v_fract_f16 v5.l, s105
+// GFX11: v_fract_f16_e32 v5.l, s105              ; encoding: [0x69,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, s1
-// GFX11: v_fract_f16_e32 v5, s1                  ; encoding: [0x01,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, vcc_lo
+// GFX11: v_fract_f16_e32 v5.l, vcc_lo            ; encoding: [0x6a,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, s105
-// GFX11: v_fract_f16_e32 v5, s105                ; encoding: [0x69,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, vcc_hi
+// GFX11: v_fract_f16_e32 v5.l, vcc_hi            ; encoding: [0x6b,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, vcc_lo
-// GFX11: v_fract_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, ttmp15
+// GFX11: v_fract_f16_e32 v5.l, ttmp15            ; encoding: [0x7b,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, vcc_hi
-// GFX11: v_fract_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, m0
+// GFX11: v_fract_f16_e32 v5.l, m0                ; encoding: [0x7d,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, ttmp15
-// GFX11: v_fract_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, exec_lo
+// GFX11: v_fract_f16_e32 v5.l, exec_lo           ; encoding: [0x7e,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, m0
-// GFX11: v_fract_f16_e32 v5, m0                  ; encoding: [0x7d,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, exec_hi
+// GFX11: v_fract_f16_e32 v5.l, exec_hi           ; encoding: [0x7f,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, exec_lo
-// GFX11: v_fract_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, null
+// GFX11: v_fract_f16_e32 v5.l, null              ; encoding: [0x7c,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, exec_hi
-// GFX11: v_fract_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, -1
+// GFX11: v_fract_f16_e32 v5.l, -1                ; encoding: [0xc1,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, null
-// GFX11: v_fract_f16_e32 v5, null                ; encoding: [0x7c,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, 0.5
+// GFX11: v_fract_f16_e32 v5.l, 0.5               ; encoding: [0xf0,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, -1
-// GFX11: v_fract_f16_e32 v5, -1                  ; encoding: [0xc1,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, src_scc
+// GFX11: v_fract_f16_e32 v5.l, src_scc           ; encoding: [0xfd,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, 0.5
-// GFX11: v_fract_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbe,0x0a,0x7e]
+v_fract_f16 v127.l, 0xfe0b
+// GFX11: v_fract_f16_e32 v127.l, 0xfe0b          ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_fract_f16 v5, src_scc
-// GFX11: v_fract_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, v1.h
+// GFX11: v_fract_f16_e32 v5.l, v1.h              ; encoding: [0x81,0xbf,0x0a,0x7e]
 
-v_fract_f16 v127, 0xfe0b
-// GFX11: v_fract_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_fract_f16 v5.l, v127.h
+// GFX11: v_fract_f16_e32 v5.l, v127.h            ; encoding: [0xff,0xbf,0x0a,0x7e]
+
+v_fract_f16 v127.l, 0.5
+// GFX11: v_fract_f16_e32 v127.l, 0.5             ; encoding: [0xf0,0xbe,0xfe,0x7e]
+
+v_fract_f16 v5.h, src_scc
+// GFX11: v_fract_f16_e32 v5.h, src_scc           ; encoding: [0xfd,0xbe,0x0a,0x7f]
+
+v_fract_f16 v127.h, 0xfe0b
+// GFX11: v_fract_f16_e32 v127.h, 0xfe0b          ; encoding: [0xff,0xbe,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_fract_f32 v5, v1
 // GFX11: v_fract_f32_e32 v5, v1                  ; encoding: [0x01,0x41,0x0a,0x7e]
@@ -2351,50 +2393,65 @@ v_frexp_exp_i32_f64 v5, src_scc
 v_frexp_exp_i32_f64 v255, 0xaf123456
 // GFX11: v_frexp_exp_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x78,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
-v_frexp_mant_f16 v5, v1
-// GFX11: v_frexp_mant_f16_e32 v5, v1             ; encoding: [0x01,0xb3,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, v1.l
+// GFX11: v_frexp_mant_f16_e32 v5.l, v1.l         ; encoding: [0x01,0xb3,0x0a,0x7e]
+
+v_frexp_mant_f16 v5.l, v127.l
+// GFX11: v_frexp_mant_f16_e32 v5.l, v127.l       ; encoding: [0x7f,0xb3,0x0a,0x7e]
+
+v_frexp_mant_f16 v5.l, s1
+// GFX11: v_frexp_mant_f16_e32 v5.l, s1           ; encoding: [0x01,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, v127
-// GFX11: v_frexp_mant_f16_e32 v5, v127           ; encoding: [0x7f,0xb3,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, s105
+// GFX11: v_frexp_mant_f16_e32 v5.l, s105         ; encoding: [0x69,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, s1
-// GFX11: v_frexp_mant_f16_e32 v5, s1             ; encoding: [0x01,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, vcc_lo
+// GFX11: v_frexp_mant_f16_e32 v5.l, vcc_lo       ; encoding: [0x6a,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, s105
-// GFX11: v_frexp_mant_f16_e32 v5, s105           ; encoding: [0x69,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, vcc_hi
+// GFX11: v_frexp_mant_f16_e32 v5.l, vcc_hi       ; encoding: [0x6b,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, vcc_lo
-// GFX11: v_frexp_mant_f16_e32 v5, vcc_lo         ; encoding: [0x6a,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, ttmp15
+// GFX11: v_frexp_mant_f16_e32 v5.l, ttmp15       ; encoding: [0x7b,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, vcc_hi
-// GFX11: v_frexp_mant_f16_e32 v5, vcc_hi         ; encoding: [0x6b,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, m0
+// GFX11: v_frexp_mant_f16_e32 v5.l, m0           ; encoding: [0x7d,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, ttmp15
-// GFX11: v_frexp_mant_f16_e32 v5, ttmp15         ; encoding: [0x7b,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, exec_lo
+// GFX11: v_frexp_mant_f16_e32 v5.l, exec_lo      ; encoding: [0x7e,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, m0
-// GFX11: v_frexp_mant_f16_e32 v5, m0             ; encoding: [0x7d,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, exec_hi
+// GFX11: v_frexp_mant_f16_e32 v5.l, exec_hi      ; encoding: [0x7f,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, exec_lo
-// GFX11: v_frexp_mant_f16_e32 v5, exec_lo        ; encoding: [0x7e,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, null
+// GFX11: v_frexp_mant_f16_e32 v5.l, null         ; encoding: [0x7c,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, exec_hi
-// GFX11: v_frexp_mant_f16_e32 v5, exec_hi        ; encoding: [0x7f,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, -1
+// GFX11: v_frexp_mant_f16_e32 v5.l, -1           ; encoding: [0xc1,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, null
-// GFX11: v_frexp_mant_f16_e32 v5, null           ; encoding: [0x7c,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, 0.5
+// GFX11: v_frexp_mant_f16_e32 v5.l, 0.5          ; encoding: [0xf0,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, -1
-// GFX11: v_frexp_mant_f16_e32 v5, -1             ; encoding: [0xc1,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, src_scc
+// GFX11: v_frexp_mant_f16_e32 v5.l, src_scc      ; encoding: [0xfd,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, 0.5
-// GFX11: v_frexp_mant_f16_e32 v5, 0.5            ; encoding: [0xf0,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v127.l, 0xfe0b
+// GFX11: v_frexp_mant_f16_e32 v127.l, 0xfe0b     ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_frexp_mant_f16 v5, src_scc
-// GFX11: v_frexp_mant_f16_e32 v5, src_scc        ; encoding: [0xfd,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, v1.h
+// GFX11: v_frexp_mant_f16_e32 v5.l, v1.h         ; encoding: [0x81,0xb3,0x0a,0x7e]
 
-v_frexp_mant_f16 v127, 0xfe0b
-// GFX11: v_frexp_mant_f16_e32 v127, 0xfe0b       ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_frexp_mant_f16 v5.l, v127.h
+// GFX11: v_frexp_mant_f16_e32 v5.l, v127.h       ; encoding: [0xff,0xb3,0x0a,0x7e]
+
+v_frexp_mant_f16 v127.l, 0.5
+// GFX11: v_frexp_mant_f16_e32 v127.l, 0.5        ; encoding: [0xf0,0xb2,0xfe,0x7e]
+
+v_frexp_mant_f16 v5.h, src_scc
+// GFX11: v_frexp_mant_f16_e32 v5.h, src_scc      ; encoding: [0xfd,0xb2,0x0a,0x7f]
+
+v_frexp_mant_f16 v127.h, 0xfe0b
+// GFX11: v_frexp_mant_f16_e32 v127.h, 0xfe0b     ; encoding: [0xff,0xb2,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_frexp_mant_f32 v5, v1
 // GFX11: v_frexp_mant_f32_e32 v5, v1             ; encoding: [0x01,0x81,0x0a,0x7e]
@@ -2684,50 +2741,65 @@ v_movrelsd_b32 v255, v255
 v_nop
 // GFX11: v_nop                                   ; encoding: [0x00,0x00,0x00,0x7e]
 
-v_not_b16 v5, v1
-// GFX11: v_not_b16_e32 v5, v1                    ; encoding: [0x01,0xd3,0x0a,0x7e]
+v_not_b16 v5.l, v1.l
+// GFX11: v_not_b16_e32 v5.l, v1.l                ; encoding: [0x01,0xd3,0x0a,0x7e]
+
+v_not_b16 v5.l, v127.l
+// GFX11: v_not_b16_e32 v5.l, v127.l              ; encoding: [0x7f,0xd3,0x0a,0x7e]
+
+v_not_b16 v5.l, s1
+// GFX11: v_not_b16_e32 v5.l, s1                  ; encoding: [0x01,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, v127
-// GFX11: v_not_b16_e32 v5, v127                  ; encoding: [0x7f,0xd3,0x0a,0x7e]
+v_not_b16 v5.l, s105
+// GFX11: v_not_b16_e32 v5.l, s105                ; encoding: [0x69,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, s1
-// GFX11: v_not_b16_e32 v5, s1                    ; encoding: [0x01,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, vcc_lo
+// GFX11: v_not_b16_e32 v5.l, vcc_lo              ; encoding: [0x6a,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, s105
-// GFX11: v_not_b16_e32 v5, s105                  ; encoding: [0x69,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, vcc_hi
+// GFX11: v_not_b16_e32 v5.l, vcc_hi              ; encoding: [0x6b,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, vcc_lo
-// GFX11: v_not_b16_e32 v5, vcc_lo                ; encoding: [0x6a,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, ttmp15
+// GFX11: v_not_b16_e32 v5.l, ttmp15              ; encoding: [0x7b,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, vcc_hi
-// GFX11: v_not_b16_e32 v5, vcc_hi                ; encoding: [0x6b,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, m0
+// GFX11: v_not_b16_e32 v5.l, m0                  ; encoding: [0x7d,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, ttmp15
-// GFX11: v_not_b16_e32 v5, ttmp15                ; encoding: [0x7b,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, exec_lo
+// GFX11: v_not_b16_e32 v5.l, exec_lo             ; encoding: [0x7e,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, m0
-// GFX11: v_not_b16_e32 v5, m0                    ; encoding: [0x7d,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, exec_hi
+// GFX11: v_not_b16_e32 v5.l, exec_hi             ; encoding: [0x7f,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, exec_lo
-// GFX11: v_not_b16_e32 v5, exec_lo               ; encoding: [0x7e,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, null
+// GFX11: v_not_b16_e32 v5.l, null                ; encoding: [0x7c,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, exec_hi
-// GFX11: v_not_b16_e32 v5, exec_hi               ; encoding: [0x7f,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, -1
+// GFX11: v_not_b16_e32 v5.l, -1                  ; encoding: [0xc1,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, null
-// GFX11: v_not_b16_e32 v5, null                  ; encoding: [0x7c,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, 0.5
+// GFX11: v_not_b16_e32 v5.l, 0.5                 ; encoding: [0xf0,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, -1
-// GFX11: v_not_b16_e32 v5, -1                    ; encoding: [0xc1,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, src_scc
+// GFX11: v_not_b16_e32 v5.l, src_scc             ; encoding: [0xfd,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, 0.5
-// GFX11: v_not_b16_e32 v5, 0.5                   ; encoding: [0xf0,0xd2,0x0a,0x7e]
+v_not_b16 v127.l, 0xfe0b
+// GFX11: v_not_b16_e32 v127.l, 0xfe0b            ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_not_b16 v5, src_scc
-// GFX11: v_not_b16_e32 v5, src_scc               ; encoding: [0xfd,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, v1.h
+// GFX11: v_not_b16_e32 v5.l, v1.h                ; encoding: [0x81,0xd3,0x0a,0x7e]
 
-v_not_b16 v127, 0xfe0b
-// GFX11: v_not_b16_e32 v127, 0xfe0b              ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_not_b16 v5.l, v127.h
+// GFX11: v_not_b16_e32 v5.l, v127.h              ; encoding: [0xff,0xd3,0x0a,0x7e]
+
+v_not_b16 v127.l, 0.5
+// GFX11: v_not_b16_e32 v127.l, 0.5               ; encoding: [0xf0,0xd2,0xfe,0x7e]
+
+v_not_b16 v5.h, src_scc
+// GFX11: v_not_b16_e32 v5.h, src_scc             ; encoding: [0xfd,0xd2,0x0a,0x7f]
+
+v_not_b16 v127.h, 0xfe0b
+// GFX11: v_not_b16_e32 v127.h, 0xfe0b            ; encoding: [0xff,0xd2,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_not_b32 v5, v1
 // GFX11: v_not_b32_e32 v5, v1                    ; encoding: [0x01,0x6f,0x0a,0x7e]
@@ -2978,50 +3050,65 @@ v_readfirstlane_b32 ttmp15, v1
 v_readfirstlane_b32 null, v255
 // GFX11: v_readfirstlane_b32 null, v255          ; encoding: [0xff,0x05,0xf8,0x7e]
 
-v_rndne_f16 v5, v1
-// GFX11: v_rndne_f16_e32 v5, v1                  ; encoding: [0x01,0xbd,0x0a,0x7e]
+v_rndne_f16 v5.l, v1.l
+// GFX11: v_rndne_f16_e32 v5.l, v1.l              ; encoding: [0x01,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v5.l, v127.l
+// GFX11: v_rndne_f16_e32 v5.l, v127.l            ; encoding: [0x7f,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v5.l, s1
+// GFX11: v_rndne_f16_e32 v5.l, s1                ; encoding: [0x01,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5.l, s105
+// GFX11: v_rndne_f16_e32 v5.l, s105              ; encoding: [0x69,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, v127
-// GFX11: v_rndne_f16_e32 v5, v127                ; encoding: [0x7f,0xbd,0x0a,0x7e]
+v_rndne_f16 v5.l, vcc_lo
+// GFX11: v_rndne_f16_e32 v5.l, vcc_lo            ; encoding: [0x6a,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, s1
-// GFX11: v_rndne_f16_e32 v5, s1                  ; encoding: [0x01,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, vcc_hi
+// GFX11: v_rndne_f16_e32 v5.l, vcc_hi            ; encoding: [0x6b,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, s105
-// GFX11: v_rndne_f16_e32 v5, s105                ; encoding: [0x69,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, ttmp15
+// GFX11: v_rndne_f16_e32 v5.l, ttmp15            ; encoding: [0x7b,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, vcc_lo
-// GFX11: v_rndne_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, m0
+// GFX11: v_rndne_f16_e32 v5.l, m0                ; encoding: [0x7d,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, vcc_hi
-// GFX11: v_rndne_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, exec_lo
+// GFX11: v_rndne_f16_e32 v5.l, exec_lo           ; encoding: [0x7e,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, ttmp15
-// GFX11: v_rndne_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, exec_hi
+// GFX11: v_rndne_f16_e32 v5.l, exec_hi           ; encoding: [0x7f,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, m0
-// GFX11: v_rndne_f16_e32 v5, m0                  ; encoding: [0x7d,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, null
+// GFX11: v_rndne_f16_e32 v5.l, null              ; encoding: [0x7c,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, exec_lo
-// GFX11: v_rndne_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, -1
+// GFX11: v_rndne_f16_e32 v5.l, -1                ; encoding: [0xc1,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, exec_hi
-// GFX11: v_rndne_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, 0.5
+// GFX11: v_rndne_f16_e32 v5.l, 0.5               ; encoding: [0xf0,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, null
-// GFX11: v_rndne_f16_e32 v5, null                ; encoding: [0x7c,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, src_scc
+// GFX11: v_rndne_f16_e32 v5.l, src_scc           ; encoding: [0xfd,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, -1
-// GFX11: v_rndne_f16_e32 v5, -1                  ; encoding: [0xc1,0xbc,0x0a,0x7e]
+v_rndne_f16 v127.l, 0xfe0b
+// GFX11: v_rndne_f16_e32 v127.l, 0xfe0b          ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_rndne_f16 v5, 0.5
-// GFX11: v_rndne_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, v1.h
+// GFX11: v_rndne_f16_e32 v5.l, v1.h              ; encoding: [0x81,0xbd,0x0a,0x7e]
 
-v_rndne_f16 v5, src_scc
-// GFX11: v_rndne_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, v127.h
+// GFX11: v_rndne_f16_e32 v5.l, v127.h            ; encoding: [0xff,0xbd,0x0a,0x7e]
 
-v_rndne_f16 v127, 0xfe0b
-// GFX11: v_rndne_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_rndne_f16 v127.l, 0.5
+// GFX11: v_rndne_f16_e32 v127.l, 0.5             ; encoding: [0xf0,0xbc,0xfe,0x7e]
+
+v_rndne_f16 v5.h, src_scc
+// GFX11: v_rndne_f16_e32 v5.h, src_scc           ; encoding: [0xfd,0xbc,0x0a,0x7f]
+
+v_rndne_f16 v127.h, 0xfe0b
+// GFX11: v_rndne_f16_e32 v127.h, 0xfe0b          ; encoding: [0xff,0xbc,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_rndne_f32 v5, v1
 // GFX11: v_rndne_f32_e32 v5, v1                  ; encoding: [0x01,0x47,0x0a,0x7e]
@@ -3236,95 +3323,119 @@ v_rsq_f64 v[5:6], src_scc
 v_rsq_f64 v[254:255], 0xaf123456
 // GFX11: v_rsq_f64_e32 v[254:255], 0xaf123456    ; encoding: [0xff,0x62,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
-v_sat_pk_u8_i16 v5, v1
-// GFX11: v_sat_pk_u8_i16_e32 v5, v1              ; encoding: [0x01,0xc5,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, v1
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, v1            ; encoding: [0x01,0xc5,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5.l, v255
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, v255          ; encoding: [0xff,0xc5,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5.l, s1
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, s1            ; encoding: [0x01,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5.l, s105
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, s105          ; encoding: [0x69,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, v255
-// GFX11: v_sat_pk_u8_i16_e32 v5, v255            ; encoding: [0xff,0xc5,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, vcc_lo
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, vcc_lo        ; encoding: [0x6a,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, s1
-// GFX11: v_sat_pk_u8_i16_e32 v5, s1              ; encoding: [0x01,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, vcc_hi
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, vcc_hi        ; encoding: [0x6b,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, s105
-// GFX11: v_sat_pk_u8_i16_e32 v5, s105            ; encoding: [0x69,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, ttmp15
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, ttmp15        ; encoding: [0x7b,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, vcc_lo
-// GFX11: v_sat_pk_u8_i16_e32 v5, vcc_lo          ; encoding: [0x6a,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, m0
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, m0            ; encoding: [0x7d,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, vcc_hi
-// GFX11: v_sat_pk_u8_i16_e32 v5, vcc_hi          ; encoding: [0x6b,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, exec_lo
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, exec_lo       ; encoding: [0x7e,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, ttmp15
-// GFX11: v_sat_pk_u8_i16_e32 v5, ttmp15          ; encoding: [0x7b,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, exec_hi
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, exec_hi       ; encoding: [0x7f,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, m0
-// GFX11: v_sat_pk_u8_i16_e32 v5, m0              ; encoding: [0x7d,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, null
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, null          ; encoding: [0x7c,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, exec_lo
-// GFX11: v_sat_pk_u8_i16_e32 v5, exec_lo         ; encoding: [0x7e,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, -1
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, -1            ; encoding: [0xc1,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, exec_hi
-// GFX11: v_sat_pk_u8_i16_e32 v5, exec_hi         ; encoding: [0x7f,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, 0.5
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, 0.5           ; encoding: [0xf0,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, null
-// GFX11: v_sat_pk_u8_i16_e32 v5, null            ; encoding: [0x7c,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, src_scc
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, src_scc       ; encoding: [0xfd,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, -1
-// GFX11: v_sat_pk_u8_i16_e32 v5, -1              ; encoding: [0xc1,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v127.l, 0xfe0b
+// GFX11: v_sat_pk_u8_i16_e32 v127.l, 0xfe0b      ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_sat_pk_u8_i16 v5, 0.5
-// GFX11: v_sat_pk_u8_i16_e32 v5, 0.5             ; encoding: [0xf0,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v127.l, 0.5
+// GFX11: v_sat_pk_u8_i16_e32 v127.l, 0.5         ; encoding: [0xf0,0xc4,0xfe,0x7e]
 
-v_sat_pk_u8_i16 v5, src_scc
-// GFX11: v_sat_pk_u8_i16_e32 v5, src_scc         ; encoding: [0xfd,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.h, src_scc
+// GFX11: v_sat_pk_u8_i16_e32 v5.h, src_scc       ; encoding: [0xfd,0xc4,0x0a,0x7f]
 
-v_sat_pk_u8_i16 v127, 0xfe0b
-// GFX11: v_sat_pk_u8_i16_e32 v127, 0xfe0b        ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_sat_pk_u8_i16 v127.h, 0xfe0b
+// GFX11: v_sat_pk_u8_i16_e32 v127.h, 0xfe0b      ; encoding: [0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
-v_sin_f16 v5, v1
-// GFX11: v_sin_f16_e32 v5, v1                    ; encoding: [0x01,0xc1,0x0a,0x7e]
+v_sin_f16 v5.l, v1.l
+// GFX11: v_sin_f16_e32 v5.l, v1.l                ; encoding: [0x01,0xc1,0x0a,0x7e]
 
-v_sin_f16 v5, v127
-// GFX11: v_sin_f16_e32 v5, v127                  ; encoding: [0x7f,0xc1,0x0a,0x7e]
+v_sin_f16 v5.l, v127.l
+// GFX11: v_sin_f16_e32 v5.l, v127.l              ; encoding: [0x7f,0xc1,0x0a,0x7e]
 
-v_sin_f16 v5, s1
-// GFX11: v_sin_f16_e32 v5, s1                    ; encoding: [0x01,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, s1
+// GFX11: v_sin_f16_e32 v5.l, s1                  ; encoding: [0x01,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, s105
-// GFX11: v_sin_f16_e32 v5, s105                  ; encoding: [0x69,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, s105
+// GFX11: v_sin_f16_e32 v5.l, s105                ; encoding: [0x69,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, vcc_lo
-// GFX11: v_sin_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, vcc_lo
+// GFX11: v_sin_f16_e32 v5.l, vcc_lo              ; encoding: [0x6a,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, vcc_hi
-// GFX11: v_sin_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, vcc_hi
+// GFX11: v_sin_f16_e32 v5.l, vcc_hi              ; encoding: [0x6b,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, ttmp15
-// GFX11: v_sin_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, ttmp15
+// GFX11: v_sin_f16_e32 v5.l, ttmp15              ; encoding: [0x7b,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, m0
-// GFX11: v_sin_f16_e32 v5, m0                    ; encoding: [0x7d,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, m0
+// GFX11: v_sin_f16_e32 v5.l, m0                  ; encoding: [0x7d,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, exec_lo
-// GFX11: v_sin_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, exec_lo
+// GFX11: v_sin_f16_e32 v5.l, exec_lo             ; encoding: [0x7e,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, exec_hi
-// GFX11: v_sin_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, exec_hi
+// GFX11: v_sin_f16_e32 v5.l, exec_hi             ; encoding: [0x7f,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, null
-// GFX11: v_sin_f16_e32 v5, null                  ; encoding: [0x7c,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, null
+// GFX11: v_sin_f16_e32 v5.l, null                ; encoding: [0x7c,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, -1
-// GFX11: v_sin_f16_e32 v5, -1                    ; encoding: [0xc1,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, -1
+// GFX11: v_sin_f16_e32 v5.l, -1                  ; encoding: [0xc1,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, 0.5
-// GFX11: v_sin_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, 0.5
+// GFX11: v_sin_f16_e32 v5.l, 0.5                 ; encoding: [0xf0,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, src_scc
-// GFX11: v_sin_f16_e32 v5, src_scc               ; encoding: [0xfd,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, src_scc
+// GFX11: v_sin_f16_e32 v5.l, src_scc             ; encoding: [0xfd,0xc0,0x0a,0x7e]
 
-v_sin_f16 v127, 0xfe0b
-// GFX11: v_sin_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_sin_f16 v127.l, 0xfe0b
+// GFX11: v_sin_f16_e32 v127.l, 0xfe0b            ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_sin_f16 v5.l, v1.h
+// GFX11: v_sin_f16_e32 v5.l, v1.h                ; encoding: [0x81,0xc1,0x0a,0x7e]
+
+v_sin_f16 v5.l, v127.h
+// GFX11: v_sin_f16_e32 v5.l, v127.h              ; encoding: [0xff,0xc1,0x0a,0x7e]
+
+v_sin_f16 v127.l, 0.5
+// GFX11: v_sin_f16_e32 v127.l, 0.5               ; encoding: [0xf0,0xc0,0xfe,0x7e]
+
+v_sin_f16 v5.h, src_scc
+// GFX11: v_sin_f16_e32 v5.h, src_scc             ; encoding: [0xfd,0xc0,0x0a,0x7f]
+
+v_sin_f16 v127.h, 0xfe0b
+// GFX11: v_sin_f16_e32 v127.h, 0xfe0b            ; encoding: [0xff,0xc0,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_sin_f32 v5, v1
 // GFX11: v_sin_f32_e32 v5, v1                    ; encoding: [0x01,0x6b,0x0a,0x7e]
@@ -3524,50 +3635,65 @@ v_swaprel_b32 v5, v1
 v_swaprel_b32 v255, v255
 // GFX11: v_swaprel_b32 v255, v255                ; encoding: [0xff,0xd1,0xfe,0x7f]
 
-v_trunc_f16 v5, v1
-// GFX11: v_trunc_f16_e32 v5, v1                  ; encoding: [0x01,0xbb,0x0a,0x7e]
+v_trunc_f16 v5.l, v1.l
+// GFX11: v_trunc_f16_e32 v5.l, v1.l              ; encoding: [0x01,0xbb,0x0a,0x7e]
+
+v_trunc_f16 v5.l, v127.l
+// GFX11: v_trunc_f16_e32 v5.l, v127.l            ; encoding: [0x7f,0xbb,0x0a,0x7e]
+
+v_trunc_f16 v5.l, s1
+// GFX11: v_trunc_f16_e32 v5.l, s1                ; encoding: [0x01,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5.l, s105
+// GFX11: v_trunc_f16_e32 v5.l, s105              ; encoding: [0x69,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5.l, vcc_lo
+// GFX11: v_trunc_f16_e32 v5.l, vcc_lo            ; encoding: [0x6a,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5.l, vcc_hi
+// GFX11: v_trunc_f16_e32 v5.l, vcc_hi            ; encoding: [0x6b,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, v127
-// GFX11: v_trunc_f16_e32 v5, v127                ; encoding: [0x7f,0xbb,0x0a,0x7e]
+v_trunc_f16 v5.l, ttmp15
+// GFX11: v_trunc_f16_e32 v5.l, ttmp15            ; encoding: [0x7b,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, s1
-// GFX11: v_trunc_f16_e32 v5, s1                  ; encoding: [0x01,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, m0
+// GFX11: v_trunc_f16_e32 v5.l, m0                ; encoding: [0x7d,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, s105
-// GFX11: v_trunc_f16_e32 v5, s105                ; encoding: [0x69,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, exec_lo
+// GFX11: v_trunc_f16_e32 v5.l, exec_lo           ; encoding: [0x7e,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, vcc_lo
-// GFX11: v_trunc_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, exec_hi
+// GFX11: v_trunc_f16_e32 v5.l, exec_hi           ; encoding: [0x7f,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, vcc_hi
-// GFX11: v_trunc_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, null
+// GFX11: v_trunc_f16_e32 v5.l, null              ; encoding: [0x7c,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, ttmp15
-// GFX11: v_trunc_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, -1
+// GFX11: v_trunc_f16_e32 v5.l, -1                ; encoding: [0xc1,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, m0
-// GFX11: v_trunc_f16_e32 v5, m0                  ; encoding: [0x7d,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, 0.5
+// GFX11: v_trunc_f16_e32 v5.l, 0.5               ; encoding: [0xf0,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, exec_lo
-// GFX11: v_trunc_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, src_scc
+// GFX11: v_trunc_f16_e32 v5.l, src_scc           ; encoding: [0xfd,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, exec_hi
-// GFX11: v_trunc_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xba,0x0a,0x7e]
+v_trunc_f16 v127.l, 0xfe0b
+// GFX11: v_trunc_f16_e32 v127.l, 0xfe0b          ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_trunc_f16 v5, null
-// GFX11: v_trunc_f16_e32 v5, null                ; encoding: [0x7c,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, v1.h
+// GFX11: v_trunc_f16_e32 v5.l, v1.h              ; encoding: [0x81,0xbb,0x0a,0x7e]
 
-v_trunc_f16 v5, -1
-// GFX11: v_trunc_f16_e32 v5, -1                  ; encoding: [0xc1,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, v127.h
+// GFX11: v_trunc_f16_e32 v5.l, v127.h            ; encoding: [0xff,0xbb,0x0a,0x7e]
 
-v_trunc_f16 v5, 0.5
-// GFX11: v_trunc_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xba,0x0a,0x7e]
+v_trunc_f16 v127.l, 0.5
+// GFX11: v_trunc_f16_e32 v127.l, 0.5             ; encoding: [0xf0,0xba,0xfe,0x7e]
 
-v_trunc_f16 v5, src_scc
-// GFX11: v_trunc_f16_e32 v5, src_scc             ; encoding: [0xfd,0xba,0x0a,0x7e]
+v_trunc_f16 v5.h, src_scc
+// GFX11: v_trunc_f16_e32 v5.h, src_scc           ; encoding: [0xfd,0xba,0x0a,0x7f]
 
-v_trunc_f16 v127, 0xfe0b
-// GFX11: v_trunc_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_trunc_f16 v127.h, 0xfe0b
+// GFX11: v_trunc_f16_e32 v127.h, 0xfe0b          ; encoding: [0xff,0xba,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_trunc_f32 v5, v1
 // GFX11: v_trunc_f32_e32 v5, v1                  ; encoding: [0x01,0x43,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s
index 98e4b29..2bdb9ec 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s
@@ -212,47 +212,56 @@ v_clz_i32_u32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_clz_i32_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_clz_i32_u32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x72,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
-v_cos_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_cos_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_cos_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
-v_cos_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_cos_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_cos_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_cos_f16 v5, v1 row_mirror
-// GFX11: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_mirror
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_cos_f16 v5, v1 row_half_mirror
-// GFX11: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_half_mirror
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_cos_f16 v5, v1 row_shl:1
-// GFX11: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_shl:1
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_cos_f16 v5, v1 row_shl:15
-// GFX11: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_shl:15
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_cos_f16 v5, v1 row_shr:1
-// GFX11: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_shr:1
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_cos_f16 v5, v1 row_shr:15
-// GFX11: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_shr:15
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_cos_f16 v5, v1 row_ror:1
-// GFX11: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_ror:1
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_cos_f16 v5, v1 row_ror:15
-// GFX11: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_ror:15
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_cos_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_cos_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_cos_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_cos_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_cos_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_cos_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_cos_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cos_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_cos_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cos_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+v_cos_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cos_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x09,0x13]
+
+v_cos_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cos_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_cos_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_cos_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -926,47 +935,56 @@ v_cvt_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cvt_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cvt_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x10,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_cvt_i32_i16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_cvt_i32_i16 v5, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_i32_i16 v5, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_cvt_i32_i16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_cvt_i32_i16 v5, v1.l row_mirror
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_mirror
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_half_mirror
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_half_mirror
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_shl:1
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_shl:1
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_shl:15
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_shl:15
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_shr:1
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_shr:1
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_shr:15
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_shr:15
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_ror:1
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_ror:1
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_ror:15
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_ror:15
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_cvt_i32_i16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_cvt_i32_i16 v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_cvt_i32_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_cvt_i32_i16 v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cvt_i32_i16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x05,0x30]
 
-v_cvt_i32_i16 v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x05,0x30]
+v_cvt_i32_i16 v5, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cvt_i32_i16_dpp v5, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x7f,0x5f,0x01,0x01]
+
+v_cvt_i32_i16 v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cvt_i32_i16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x09,0x13]
+
+v_cvt_i32_i16 v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cvt_i32_i16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
 v_cvt_nearest_i32_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_cvt_nearest_i32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1262,47 +1280,56 @@ v_cvt_u32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cvt_u32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cvt_u32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x0e,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_cvt_u32_u16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_cvt_u32_u16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_cvt_u32_u16 v5, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_u32_u16 v5, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_u32_u16 v5, v1.l row_mirror
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_cvt_u32_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_cvt_u32_u16 v5, v1.l row_half_mirror
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_mirror
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_shl:1
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_half_mirror
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_shl:15
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_shl:1
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_shr:1
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_shl:15
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_shr:15
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_shr:1
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_ror:1
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_shr:15
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_ror:15
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_ror:1
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_ror:15
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_cvt_u32_u16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_cvt_u32_u16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_cvt_u32_u16 v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cvt_u32_u16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x05,0x30]
 
-v_cvt_u32_u16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_cvt_u32_u16 v5, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cvt_u32_u16_dpp v5, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x7f,0x5f,0x01,0x01]
 
-v_cvt_u32_u16 v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cvt_u32_u16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x05,0x30]
+v_cvt_u32_u16 v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cvt_u32_u16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x81,0x60,0x09,0x13]
+
+v_cvt_u32_u16 v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cvt_u32_u16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
 v_exp_f16 v5.l, v1.l quad_perm:[3,2,1,0]
 // GFX11: v_exp_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1598,47 +1625,56 @@ v_floor_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_floor_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_floor_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x48,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_fract_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_fract_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_fract_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_fract_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_fract_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_fract_f16 v5.l, v1.l row_mirror
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_fract_f16 v5.l, v1.l row_half_mirror
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_fract_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_fract_f16 v5.l, v1.l row_shl:1
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_fract_f16 v5, v1 row_mirror
-// GFX11: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_shl:15
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_fract_f16 v5, v1 row_half_mirror
-// GFX11: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_shr:1
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_fract_f16 v5, v1 row_shl:1
-// GFX11: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_shr:15
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_fract_f16 v5, v1 row_shl:15
-// GFX11: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_ror:1
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_fract_f16 v5, v1 row_shr:1
-// GFX11: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_ror:15
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_fract_f16 v5, v1 row_shr:15
-// GFX11: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_fract_f16 v5, v1 row_ror:1
-// GFX11: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_fract_f16 v5, v1 row_ror:15
-// GFX11: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_fract_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_fract_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_fract_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 
-v_fract_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_fract_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_fract_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
 
-v_fract_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_fract_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_fract_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x09,0x13]
 
-v_fract_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_fract_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_fract_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_fract_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_fract_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1766,47 +1802,56 @@ v_frexp_exp_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 f
 v_frexp_exp_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_frexp_exp_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x7e,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_frexp_mant_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_frexp_mant_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_frexp_mant_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_frexp_mant_f16 v5.l, v1.l row_mirror
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_frexp_mant_f16 v5.l, v1.l row_half_mirror
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_shl:1
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_mirror
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_shl:15
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_half_mirror
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_shr:1
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_shl:1
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_shr:15
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_shl:15
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_ror:1
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_shr:1
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_ror:15
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_shr:15
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_ror:1
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_frexp_mant_f16 v5, v1 row_ror:15
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_frexp_mant_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_frexp_mant_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_frexp_mant_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 
-v_frexp_mant_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_frexp_mant_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_frexp_mant_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
 
-v_frexp_mant_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_frexp_mant_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_frexp_mant_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x09,0x13]
 
-v_frexp_mant_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_frexp_mant_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_frexp_mant_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_frexp_mant_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_frexp_mant_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2144,47 +2189,56 @@ v_movrelsd_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_movrelsd_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_movrelsd_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x88,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
-v_not_b16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_not_b16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_not_b16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
-v_not_b16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_not_b16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_not_b16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_not_b16 v5, v1 row_mirror
-// GFX11: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_not_b16 v5.l, v1.l row_mirror
+// GFX11: v_not_b16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_not_b16 v5, v1 row_half_mirror
-// GFX11: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_not_b16 v5.l, v1.l row_half_mirror
+// GFX11: v_not_b16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_not_b16 v5, v1 row_shl:1
-// GFX11: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_not_b16 v5.l, v1.l row_shl:1
+// GFX11: v_not_b16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_not_b16 v5, v1 row_shl:15
-// GFX11: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_not_b16 v5.l, v1.l row_shl:15
+// GFX11: v_not_b16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_not_b16 v5, v1 row_shr:1
-// GFX11: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_not_b16 v5.l, v1.l row_shr:1
+// GFX11: v_not_b16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_not_b16 v5, v1 row_shr:15
-// GFX11: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_not_b16 v5.l, v1.l row_shr:15
+// GFX11: v_not_b16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_not_b16 v5, v1 row_ror:1
-// GFX11: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_not_b16 v5.l, v1.l row_ror:1
+// GFX11: v_not_b16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_not_b16 v5, v1 row_ror:15
-// GFX11: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_not_b16 v5.l, v1.l row_ror:15
+// GFX11: v_not_b16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_not_b16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_not_b16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_not_b16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_not_b16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_not_b16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_not_b16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_not_b16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_not_b16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_not_b16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_not_b16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x05,0x30]
+v_not_b16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_not_b16_dpp v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x05,0x30]
+
+v_not_b16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_not_b16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+v_not_b16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_not_b16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x09,0x13]
+
+v_not_b16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_not_b16_dpp v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
 v_not_b32 v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_not_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2354,47 +2408,56 @@ v_rcp_iflag_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_rcp_iflag_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_rcp_iflag_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x56,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_rndne_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_rndne_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_rndne_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_rndne_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_rndne_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_rndne_f16 v5.l, v1.l row_mirror
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_mirror
-// GFX11: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_half_mirror
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_half_mirror
-// GFX11: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shl:1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shl:1
-// GFX11: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shl:15
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shl:15
-// GFX11: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shr:1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shr:1
-// GFX11: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shr:15
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shr:15
-// GFX11: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_ror:1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_ror:1
-// GFX11: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_ror:15
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_ror:15
-// GFX11: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_rndne_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_rndne_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_rndne_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_rndne_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_rndne_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 
-v_rndne_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_rndne_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_rndne_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+v_rndne_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_rndne_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x09,0x13]
+
+v_rndne_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_rndne_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_rndne_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_rndne_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2522,89 +2585,107 @@ v_rsq_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_rsq_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_rsq_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x5c,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_sat_pk_u8_i16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_sat_pk_u8_i16 v5.l, v1 quad_perm:[3,2,1,0]
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_sat_pk_u8_i16 v5.l, v1 quad_perm:[0,1,2,3]
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_sat_pk_u8_i16 v5.l, v1 row_mirror
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_sat_pk_u8_i16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_sat_pk_u8_i16 v5.l, v1 row_half_mirror
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_sat_pk_u8_i16 v5, v1 row_mirror
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_sat_pk_u8_i16 v5.l, v1 row_shl:1
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_sat_pk_u8_i16 v5, v1 row_half_mirror
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_sat_pk_u8_i16 v5.l, v1 row_shl:15
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_sat_pk_u8_i16 v5, v1 row_shl:1
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_sat_pk_u8_i16 v5.l, v1 row_shr:1
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_sat_pk_u8_i16 v5, v1 row_shl:15
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_sat_pk_u8_i16 v5.l, v1 row_shr:15
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_sat_pk_u8_i16 v5, v1 row_shr:1
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_sat_pk_u8_i16 v5.l, v1 row_ror:1
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_sat_pk_u8_i16 v5, v1 row_shr:15
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_sat_pk_u8_i16 v5.l, v1 row_ror:15
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_sat_pk_u8_i16 v5, v1 row_ror:1
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_sat_pk_u8_i16 v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_sat_pk_u8_i16 v5, v1 row_ror:15
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_sat_pk_u8_i16 v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_sat_pk_u8_i16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_sat_pk_u8_i16 v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_sat_pk_u8_i16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_sat_pk_u8_i16 v127.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_sat_pk_u8_i16_dpp v127.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x05,0x30]
 
-v_sat_pk_u8_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_sat_pk_u8_i16 v127.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_sat_pk_u8_i16_dpp v127.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0x01,0x5f,0x01,0x01]
 
-v_sat_pk_u8_i16 v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x05,0x30]
+v_sat_pk_u8_i16 v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_sat_pk_u8_i16_dpp v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x09,0x13]
 
-v_sin_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_sat_pk_u8_i16 v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_sat_pk_u8_i16_dpp v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
-v_sin_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_sin_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_sin_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
-v_sin_f16 v5, v1 row_mirror
-// GFX11: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_sin_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_sin_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_sin_f16 v5, v1 row_half_mirror
-// GFX11: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_mirror
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_sin_f16 v5, v1 row_shl:1
-// GFX11: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_half_mirror
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_sin_f16 v5, v1 row_shl:15
-// GFX11: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_shl:1
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_sin_f16 v5, v1 row_shr:1
-// GFX11: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_shl:15
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_sin_f16 v5, v1 row_shr:15
-// GFX11: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_shr:1
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_sin_f16 v5, v1 row_ror:1
-// GFX11: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_shr:15
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_sin_f16 v5, v1 row_ror:15
-// GFX11: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_ror:1
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_sin_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_ror:15
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_sin_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_sin_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_sin_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_sin_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_sin_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_sin_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_sin_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_sin_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_sin_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_sin_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+v_sin_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_sin_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x09,0x13]
+
+v_sin_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_sin_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_sin_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_sin_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2732,47 +2813,56 @@ v_sqrt_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_sqrt_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_sqrt_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x66,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_trunc_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_trunc_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_trunc_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_trunc_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_trunc_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_trunc_f16 v5.l, v1.l row_mirror
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_trunc_f16 v5.l, v1.l row_half_mirror
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_trunc_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_trunc_f16 v5.l, v1.l row_shl:1
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_mirror
-// GFX11: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_shl:15
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_half_mirror
-// GFX11: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_shr:1
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_shl:1
-// GFX11: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_shr:15
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_shl:15
-// GFX11: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_ror:1
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_shr:1
-// GFX11: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_ror:15
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_shr:15
-// GFX11: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_ror:1
-// GFX11: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_trunc_f16 v5, v1 row_ror:15
-// GFX11: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_trunc_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_trunc_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_trunc_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 
-v_trunc_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_trunc_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_trunc_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
 
-v_trunc_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_trunc_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_trunc_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7f,0x81,0x60,0x09,0x13]
 
-v_trunc_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_trunc_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_trunc_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_trunc_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_trunc_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s
index ab4606a..ba0c349 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s
@@ -50,14 +50,23 @@ v_clz_i32_u32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_clz_i32_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_clz_i32_u32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x72,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_cos_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cos_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_cos_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cos_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_cos_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_cos_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cos_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_cos_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cos_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+v_cos_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cos_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+
+v_cos_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cos_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_cos_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cos_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -218,14 +227,23 @@ v_cvt_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cvt_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_cvt_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x10,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_cvt_i32_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cvt_i32_i16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_cvt_i32_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cvt_i32_i16 v255, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cvt_i32_i16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
 
-v_cvt_i32_i16 v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+v_cvt_i32_i16 v5, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cvt_i32_i16_dpp v5, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x7f,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cvt_i32_i16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cvt_i32_i16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_cvt_nearest_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cvt_nearest_i32_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -299,14 +317,23 @@ v_cvt_u32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cvt_u32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_cvt_u32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x0e,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_cvt_u32_u16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cvt_u32_u16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cvt_u32_u16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_u32_u16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_u32_u16 v255, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cvt_u32_u16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
 
-v_cvt_u32_u16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cvt_u32_u16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cvt_u32_u16 v5, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cvt_u32_u16_dpp v5, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x7f,0x77,0x39,0x05]
 
-v_cvt_u32_u16 v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cvt_u32_u16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+v_cvt_u32_u16 v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cvt_u32_u16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd6,0x0a,0x7e,0x81,0x77,0x39,0x05]
+
+v_cvt_u32_u16 v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cvt_u32_u16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd6,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_exp_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_exp_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -377,14 +404,23 @@ v_floor_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_floor_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_floor_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x48,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_fract_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_fract_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_fract_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_fract_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_fract_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_fract_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fract_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x77,0x39,0x05]
 
-v_fract_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_fract_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_fract_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05]
 
-v_fract_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_fract_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_fract_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_fract_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_fract_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -416,14 +452,23 @@ v_frexp_exp_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_frexp_exp_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_frexp_exp_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x7e,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
 
-v_frexp_mant_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_frexp_mant_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05]
 
-v_frexp_mant_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_frexp_mant_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_frexp_mant_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_frexp_mant_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_frexp_mant_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -500,14 +545,23 @@ v_movrelsd_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_movrelsd_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_movrelsd_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x88,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_not_b16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_not_b16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_not_b16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_not_b16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_not_b16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_not_b16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_not_b16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_not_b16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_not_b16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+v_not_b16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_not_b16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+
+v_not_b16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_not_b16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_not_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_not_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -548,14 +602,23 @@ v_rcp_iflag_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_rcp_iflag_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_rcp_iflag_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x56,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_rndne_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rndne_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_rndne_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_rndne_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_rndne_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
 
-v_rndne_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_rndne_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+v_rndne_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_rndne_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05]
+
+v_rndne_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_rndne_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_rndne_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_rndne_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -587,23 +650,41 @@ v_rsq_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_rsq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_rsq_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x5c,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_sat_pk_u8_i16 v5.l, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sat_pk_u8_i16 v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sat_pk_u8_i16 v127.l, v255 dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_sat_pk_u8_i16_dpp v127.l, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_sat_pk_u8_i16 v127.l, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sat_pk_u8_i16_dpp v127.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0x01,0x77,0x39,0x05]
 
-v_sat_pk_u8_i16 v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00]
+v_sat_pk_u8_i16 v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_sat_pk_u8_i16_dpp v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05]
 
-v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_sat_pk_u8_i16 v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_sin_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_sin_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_sin_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sin_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_sin_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_sin_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sin_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+v_sin_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_sin_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05]
+
+v_sin_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_sin_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_sin_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_sin_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -635,14 +716,23 @@ v_sqrt_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_sqrt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_sqrt_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x66,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_trunc_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_trunc_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_trunc_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_trunc_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_trunc_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_trunc_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_trunc_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x77,0x39,0x05]
 
-v_trunc_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_trunc_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_trunc_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05]
 
-v_trunc_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_trunc_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_trunc_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_trunc_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_trunc_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
index 4ae9134..dea33dc 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s
 
@@ -47,6 +47,12 @@ v_ceil_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
 v_cos_f16_e32 v128, 0xfe0b
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_cos_f16_e32 v128.h, 0xfe0b
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v128.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_cos_f16_e32 v255, v1
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -56,6 +62,24 @@ v_cos_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_cos_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
 
+v_cos_f16_e32 v255.h, v1.h
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v255.l, v1.l
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_cos_f16_e32 v5, v199
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -65,6 +89,24 @@ v_cos_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_cos_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
 
+v_cos_f16_e32 v5.h, v199.h
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_cos_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_cos_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_cos_f16_e32 v5.l, v199.l
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_cos_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_cos_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
 v_cvt_f16_f32_e32 v128, 0xaf123456
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -236,6 +278,24 @@ v_cvt_i32_i16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_cvt_i32_i16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
 
+v_cvt_i32_i16_e32 v5.h, v199.h
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5.l, v199.l
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
 v_cvt_norm_i16_f16_e32 v128.h, 0xfe0b
 // GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
 
@@ -371,6 +431,24 @@ v_cvt_u32_u16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_cvt_u32_u16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
 
+v_cvt_u32_u16_e32 v5.h, v199.h
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5.l, v199.l
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
 v_exp_f16_e32 v128.h, 0xfe0b
 // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
 
@@ -458,6 +536,12 @@ v_floor_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
 v_fract_f16_e32 v128, 0xfe0b
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_fract_f16_e32 v128.h, 0xfe0b
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v128.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_fract_f16_e32 v255, v1
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -467,6 +551,24 @@ v_fract_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_fract_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_fract_f16_e32 v255.h, v1.h
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v255.l, v1.l
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_fract_f16_e32 v5, v199
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -476,6 +578,24 @@ v_fract_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_fract_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_fract_f16_e32 v5.h, v199.h
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fract_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fract_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fract_f16_e32 v5.l, v199.l
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fract_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fract_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
 v_frexp_exp_i16_f16_e32 v128.h, 0xfe0b
 // GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
 
@@ -521,6 +641,12 @@ v_frexp_exp_i16_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
 v_frexp_mant_f16_e32 v128, 0xfe0b
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_frexp_mant_f16_e32 v128.h, 0xfe0b
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v128.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
 v_frexp_mant_f16_e32 v255, v1
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -530,6 +656,24 @@ v_frexp_mant_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_frexp_mant_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:31: error: invalid operand for instruction
 
+v_frexp_mant_f16_e32 v255.h, v1.h
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255.l, v1.l
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
 v_frexp_mant_f16_e32 v5, v199
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -539,6 +683,24 @@ v_frexp_mant_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_frexp_mant_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:31: error: invalid operand for instruction
 
+v_frexp_mant_f16_e32 v5.h, v199.h
+// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5.l, v199.l
+// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+
 v_log_f16_e32 v128.h, 0xfe0b
 // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
 
@@ -584,6 +746,12 @@ v_log_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
 v_not_b16_e32 v128, 0xfe0b
 // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
 
+v_not_b16_e32 v128.h, 0xfe0b
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v128.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_not_b16_e32 v255, v1
 // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
 
@@ -593,6 +761,24 @@ v_not_b16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_not_b16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
 
+v_not_b16_e32 v255.h, v1.h
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v255.l, v1.l
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_not_b16_e32 v5, v199
 // GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
 
@@ -602,6 +788,24 @@ v_not_b16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_not_b16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
 
+v_not_b16_e32 v5.h, v199.h
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_not_b16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_not_b16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_not_b16_e32 v5.l, v199.l
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_not_b16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_not_b16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
 v_rcp_f16_e32 v128.h, 0xfe0b
 // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
 
@@ -647,6 +851,12 @@ v_rcp_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
 v_rndne_f16_e32 v128, 0xfe0b
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_rndne_f16_e32 v128.h, 0xfe0b
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v128.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_rndne_f16_e32 v255, v1
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -656,6 +866,24 @@ v_rndne_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_rndne_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_rndne_f16_e32 v255.h, v1.h
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_rndne_f16_e32 v5, v199
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -665,6 +893,24 @@ v_rndne_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_rndne_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_rndne_f16_e32 v5.h, v199.h
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
 v_rsq_f16_e32 v128.h, 0xfe0b
 // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
 
@@ -716,9 +962,33 @@ v_sat_pk_u8_i16_e32 v199, v5 dpp8:[7,6,5,4,3,2,1,0]
 v_sat_pk_u8_i16_e32 v199, v5 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:30: error: invalid operand for instruction
 
+v_sat_pk_u8_i16_e32 v199.h, v5.h
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199.h, v5.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199.h, v5.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199.l, v5.l
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199.l, v5.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199.l, v5.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
 v_sin_f16_e32 v128, 0xfe0b
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_sin_f16_e32 v128.h, 0xfe0b
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v128.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_sin_f16_e32 v255, v1
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -728,6 +998,24 @@ v_sin_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_sin_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
 
+v_sin_f16_e32 v255.h, v1.h
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v255.l, v1.l
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_sin_f16_e32 v5, v199
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -737,6 +1025,24 @@ v_sin_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_sin_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
 
+v_sin_f16_e32 v5.h, v199.h
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sin_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sin_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sin_f16_e32 v5.l, v199.l
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sin_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sin_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
 v_sqrt_f16_e32 v128.h, 0xfe0b
 // GFX11: :[[@LINE-1]]:16: error: invalid operand for instruction
 
@@ -794,6 +1100,12 @@ v_swap_b16_e32 v128.l, v0.l
 v_trunc_f16_e32 v128, 0xfe0b
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_trunc_f16_e32 v128.h, 0xfe0b
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v128.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_trunc_f16_e32 v255, v1
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -803,6 +1115,24 @@ v_trunc_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_trunc_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_trunc_f16_e32 v255.h, v1.h
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255.l, v1.l
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_trunc_f16_e32 v5, v199
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -811,3 +1141,21 @@ v_trunc_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 
 v_trunc_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.h, v199.h
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.l, v199.l
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s
index 1d44172..5cb81c6 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX11 --implicit-check-not=_e32 %s
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX11 --implicit-check-not=_e32 %s
 
@@ -68,71 +68,137 @@ v_ceil_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_ceil_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: v_ceil_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_cos_f16 v128, 0xfe0b
-// GFX11: v_cos_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_cos_f16 v128.h, 0xfe0b
+// GFX11: v_cos_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_cos_f16 v255, -1
-// GFX11: v_cos_f16_e64 v255, -1                  ; encoding: [0xff,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
+v_cos_f16 v128.l, 0xfe0b
+// GFX11: v_cos_f16_e64 v128.l, 0xfe0b            ; encoding: [0x80,0x00,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_cos_f16 v255, 0.5
-// GFX11: v_cos_f16_e64 v255, 0.5                 ; encoding: [0xff,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x00]
+v_cos_f16 v255.h, -1
+// GFX11: v_cos_f16_e64 v255.h, -1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe1,0xd5,0xc1,0x00,0x00,0x00]
 
-v_cos_f16 v255, exec_hi
-// GFX11: v_cos_f16_e64 v255, exec_hi             ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+v_cos_f16 v255.h, 0.5
+// GFX11: v_cos_f16_e64 v255.h, 0.5 op_sel:[0,1]  ; encoding: [0xff,0x40,0xe1,0xd5,0xf0,0x00,0x00,0x00]
 
-v_cos_f16 v255, exec_lo
-// GFX11: v_cos_f16_e64 v255, exec_lo             ; encoding: [0xff,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+v_cos_f16 v255.h, exec_hi
+// GFX11: v_cos_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7f,0x00,0x00,0x00]
 
-v_cos_f16 v255, m0
-// GFX11: v_cos_f16_e64 v255, m0                  ; encoding: [0xff,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+v_cos_f16 v255.h, exec_lo
+// GFX11: v_cos_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7e,0x00,0x00,0x00]
 
-v_cos_f16 v255, null
-// GFX11: v_cos_f16_e64 v255, null                ; encoding: [0xff,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+v_cos_f16 v255.h, m0
+// GFX11: v_cos_f16_e64 v255.h, m0 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe1,0xd5,0x7d,0x00,0x00,0x00]
 
-v_cos_f16 v255, s1
-// GFX11: v_cos_f16_e64 v255, s1                  ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+v_cos_f16 v255.h, null
+// GFX11: v_cos_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7c,0x00,0x00,0x00]
 
-v_cos_f16 v255, s105
-// GFX11: v_cos_f16_e64 v255, s105                ; encoding: [0xff,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+v_cos_f16 v255.h, s1
+// GFX11: v_cos_f16_e64 v255.h, s1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe1,0xd5,0x01,0x00,0x00,0x00]
 
-v_cos_f16 v255, src_scc
-// GFX11: v_cos_f16_e64 v255, src_scc             ; encoding: [0xff,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x00]
+v_cos_f16 v255.h, s105
+// GFX11: v_cos_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x69,0x00,0x00,0x00]
 
-v_cos_f16 v255, ttmp15
-// GFX11: v_cos_f16_e64 v255, ttmp15              ; encoding: [0xff,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+v_cos_f16 v255.h, src_scc
+// GFX11: v_cos_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0xfd,0x00,0x00,0x00]
 
-v_cos_f16 v255, v1
-// GFX11: v_cos_f16_e64 v255, v1                  ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+v_cos_f16 v255.h, ttmp15
+// GFX11: v_cos_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7b,0x00,0x00,0x00]
 
-v_cos_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cos_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cos_f16 v255.h, v1.h
+// GFX11: v_cos_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00]
 
-v_cos_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX11: v_cos_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_cos_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_cos_f16 v255, v127
-// GFX11: v_cos_f16_e64 v255, v127                ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x01,0x00,0x00]
+v_cos_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_cos_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cos_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_cos_f16 v255.h, v127.h
+// GFX11: v_cos_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe1,0xd5,0x7f,0x01,0x00,0x00]
 
-v_cos_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX11: v_cos_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_cos_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_cos_f16 v255, vcc_hi
-// GFX11: v_cos_f16_e64 v255, vcc_hi              ; encoding: [0xff,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+v_cos_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_cos_f16 v255, vcc_lo
-// GFX11: v_cos_f16_e64 v255, vcc_lo              ; encoding: [0xff,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+v_cos_f16 v255.h, vcc_hi
+// GFX11: v_cos_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x6b,0x00,0x00,0x00]
 
-v_cos_f16 v5, v199
-// GFX11: v_cos_f16_e64 v5, v199                  ; encoding: [0x05,0x00,0xe1,0xd5,0xc7,0x01,0x00,0x00]
+v_cos_f16 v255.h, vcc_lo
+// GFX11: v_cos_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x6a,0x00,0x00,0x00]
 
-v_cos_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cos_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_cos_f16 v255.l, -1
+// GFX11: v_cos_f16_e64 v255.l, -1                ; encoding: [0xff,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
 
-v_cos_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX11: v_cos_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_cos_f16 v255.l, 0.5
+// GFX11: v_cos_f16_e64 v255.l, 0.5               ; encoding: [0xff,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, exec_hi
+// GFX11: v_cos_f16_e64 v255.l, exec_hi           ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, exec_lo
+// GFX11: v_cos_f16_e64 v255.l, exec_lo           ; encoding: [0xff,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, m0
+// GFX11: v_cos_f16_e64 v255.l, m0                ; encoding: [0xff,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, null
+// GFX11: v_cos_f16_e64 v255.l, null              ; encoding: [0xff,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, s1
+// GFX11: v_cos_f16_e64 v255.l, s1                ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, s105
+// GFX11: v_cos_f16_e64 v255.l, s105              ; encoding: [0xff,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, src_scc
+// GFX11: v_cos_f16_e64 v255.l, src_scc           ; encoding: [0xff,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, ttmp15
+// GFX11: v_cos_f16_e64 v255.l, ttmp15            ; encoding: [0xff,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, v1.l
+// GFX11: v_cos_f16_e64 v255.l, v1.l              ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+
+v_cos_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cos_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cos_f16 v255.l, v127.l
+// GFX11: v_cos_f16_e64 v255.l, v127.l            ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x01,0x00,0x00]
+
+v_cos_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_cos_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_cos_f16 v255.l, vcc_hi
+// GFX11: v_cos_f16_e64 v255.l, vcc_hi            ; encoding: [0xff,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, vcc_lo
+// GFX11: v_cos_f16_e64 v255.l, vcc_lo            ; encoding: [0xff,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+
+v_cos_f16 v5.h, v199.h
+// GFX11: v_cos_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe1,0xd5,0xc7,0x01,0x00,0x00]
+
+v_cos_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_cos_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_cos_f16 v5.l, v199.l
+// GFX11: v_cos_f16_e64 v5.l, v199.l              ; encoding: [0x05,0x00,0xe1,0xd5,0xc7,0x01,0x00,0x00]
+
+v_cos_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_cos_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_cvt_f16_f32 v128.h, 0xaf123456
 // GFX11: v_cvt_f16_f32_e64 v128.h, 0xaf123456 op_sel:[0,1] ; encoding: [0x80,0x40,0x8a,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
@@ -662,14 +728,23 @@ v_cvt_i16_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
 v_cvt_i16_f16 v5.l, v199.l quad_perm:[3,2,1,0]
 // GFX11: v_cvt_i16_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd3,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_cvt_i32_i16 v5, v199
-// GFX11: v_cvt_i32_i16_e64 v5, v199              ; encoding: [0x05,0x00,0xea,0xd5,0xc7,0x01,0x00,0x00]
+v_cvt_i32_i16 v5, v199.h
+// GFX11: v_cvt_i32_i16_e64 v5, v199.h op_sel:[1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xc7,0x01,0x00,0x00]
+
+v_cvt_i32_i16 v5, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v199.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v5, v199.h quad_perm:[3,2,1,0]
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v199.h op_sel:[1,0] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_cvt_i32_i16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_cvt_i32_i16 v5, v199.l
+// GFX11: v_cvt_i32_i16_e64 v5, v199.l            ; encoding: [0x05,0x00,0xea,0xd5,0xc7,0x01,0x00,0x00]
 
-v_cvt_i32_i16 v5, v199 quad_perm:[3,2,1,0]
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_cvt_i32_i16 v5, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v5, v199.l quad_perm:[3,2,1,0]
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_cvt_norm_i16_f16 v128.h, 0xfe0b
 // GFX11: v_cvt_norm_i16_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe3,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1067,14 +1142,23 @@ v_cvt_u16_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
 v_cvt_u16_f16 v5.l, v199.l quad_perm:[3,2,1,0]
 // GFX11: v_cvt_u16_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd2,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_cvt_u32_u16 v5, v199
-// GFX11: v_cvt_u32_u16_e64 v5, v199              ; encoding: [0x05,0x00,0xeb,0xd5,0xc7,0x01,0x00,0x00]
+v_cvt_u32_u16 v5, v199.h
+// GFX11: v_cvt_u32_u16_e64 v5, v199.h op_sel:[1,0] ; encoding: [0x05,0x08,0xeb,0xd5,0xc7,0x01,0x00,0x00]
+
+v_cvt_u32_u16 v5, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v199.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
 
-v_cvt_u32_u16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_cvt_u32_u16 v5, v199.h quad_perm:[3,2,1,0]
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v199.h op_sel:[1,0] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_cvt_u32_u16 v5, v199 quad_perm:[3,2,1,0]
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_cvt_u32_u16 v5, v199.l
+// GFX11: v_cvt_u32_u16_e64 v5, v199.l            ; encoding: [0x05,0x00,0xeb,0xd5,0xc7,0x01,0x00,0x00]
+
+v_cvt_u32_u16 v5, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_cvt_u32_u16 v5, v199.l quad_perm:[3,2,1,0]
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_exp_f16 v128, 0xfe0b
 // GFX11: v_exp_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xd8,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1208,71 +1292,137 @@ v_floor_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_floor_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: v_floor_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_fract_f16 v128, 0xfe0b
-// GFX11: v_fract_f16_e64 v128, 0xfe0b            ; encoding: [0x80,0x00,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_fract_f16 v128.h, 0xfe0b
+// GFX11: v_fract_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_fract_f16 v128.l, 0xfe0b
+// GFX11: v_fract_f16_e64 v128.l, 0xfe0b          ; encoding: [0x80,0x00,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_fract_f16 v255.h, -1
+// GFX11: v_fract_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xc1,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, 0.5
+// GFX11: v_fract_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xf0,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, exec_hi
+// GFX11: v_fract_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7f,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, exec_lo
+// GFX11: v_fract_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7e,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, m0
+// GFX11: v_fract_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7d,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, null
+// GFX11: v_fract_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7c,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, s1
+// GFX11: v_fract_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x01,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, s105
+// GFX11: v_fract_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x69,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, src_scc
+// GFX11: v_fract_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xfd,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, ttmp15
+// GFX11: v_fract_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7b,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, v1.h
+// GFX11: v_fract_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00]
+
+v_fract_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_fract_f16 v255, -1
-// GFX11: v_fract_f16_e64 v255, -1                ; encoding: [0xff,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
+v_fract_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_fract_f16 v255, 0.5
-// GFX11: v_fract_f16_e64 v255, 0.5               ; encoding: [0xff,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x00]
+v_fract_f16 v255.h, v127.h
+// GFX11: v_fract_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdf,0xd5,0x7f,0x01,0x00,0x00]
 
-v_fract_f16 v255, exec_hi
-// GFX11: v_fract_f16_e64 v255, exec_hi           ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
+v_fract_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_fract_f16 v255, exec_lo
-// GFX11: v_fract_f16_e64 v255, exec_lo           ; encoding: [0xff,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
+v_fract_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_fract_f16 v255, m0
-// GFX11: v_fract_f16_e64 v255, m0                ; encoding: [0xff,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
+v_fract_f16 v255.h, vcc_hi
+// GFX11: v_fract_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x6b,0x00,0x00,0x00]
 
-v_fract_f16 v255, null
-// GFX11: v_fract_f16_e64 v255, null              ; encoding: [0xff,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
+v_fract_f16 v255.h, vcc_lo
+// GFX11: v_fract_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x6a,0x00,0x00,0x00]
 
-v_fract_f16 v255, s1
-// GFX11: v_fract_f16_e64 v255, s1                ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
+v_fract_f16 v255.l, -1
+// GFX11: v_fract_f16_e64 v255.l, -1              ; encoding: [0xff,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
 
-v_fract_f16 v255, s105
-// GFX11: v_fract_f16_e64 v255, s105              ; encoding: [0xff,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
+v_fract_f16 v255.l, 0.5
+// GFX11: v_fract_f16_e64 v255.l, 0.5             ; encoding: [0xff,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x00]
 
-v_fract_f16 v255, src_scc
-// GFX11: v_fract_f16_e64 v255, src_scc           ; encoding: [0xff,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x00]
+v_fract_f16 v255.l, exec_hi
+// GFX11: v_fract_f16_e64 v255.l, exec_hi         ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
 
-v_fract_f16 v255, ttmp15
-// GFX11: v_fract_f16_e64 v255, ttmp15            ; encoding: [0xff,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
+v_fract_f16 v255.l, exec_lo
+// GFX11: v_fract_f16_e64 v255.l, exec_lo         ; encoding: [0xff,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
 
-v_fract_f16 v255, v1
-// GFX11: v_fract_f16_e64 v255, v1                ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+v_fract_f16 v255.l, m0
+// GFX11: v_fract_f16_e64 v255.l, m0              ; encoding: [0xff,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
 
-v_fract_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fract_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_fract_f16 v255.l, null
+// GFX11: v_fract_f16_e64 v255.l, null            ; encoding: [0xff,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
 
-v_fract_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX11: v_fract_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_fract_f16 v255.l, s1
+// GFX11: v_fract_f16_e64 v255.l, s1              ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
 
-v_fract_f16 v255, v127
-// GFX11: v_fract_f16_e64 v255, v127              ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x01,0x00,0x00]
+v_fract_f16 v255.l, s105
+// GFX11: v_fract_f16_e64 v255.l, s105            ; encoding: [0xff,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
 
-v_fract_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fract_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_fract_f16 v255.l, src_scc
+// GFX11: v_fract_f16_e64 v255.l, src_scc         ; encoding: [0xff,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x00]
 
-v_fract_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX11: v_fract_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_fract_f16 v255.l, ttmp15
+// GFX11: v_fract_f16_e64 v255.l, ttmp15          ; encoding: [0xff,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
 
-v_fract_f16 v255, vcc_hi
-// GFX11: v_fract_f16_e64 v255, vcc_hi            ; encoding: [0xff,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
+v_fract_f16 v255.l, v1.l
+// GFX11: v_fract_f16_e64 v255.l, v1.l            ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
 
-v_fract_f16 v255, vcc_lo
-// GFX11: v_fract_f16_e64 v255, vcc_lo            ; encoding: [0xff,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
+v_fract_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_fract_f16 v5, v199
-// GFX11: v_fract_f16_e64 v5, v199                ; encoding: [0x05,0x00,0xdf,0xd5,0xc7,0x01,0x00,0x00]
+v_fract_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_fract_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fract_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_fract_f16 v255.l, v127.l
+// GFX11: v_fract_f16_e64 v255.l, v127.l          ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x01,0x00,0x00]
 
-v_fract_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX11: v_fract_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_fract_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_fract_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_fract_f16 v255.l, vcc_hi
+// GFX11: v_fract_f16_e64 v255.l, vcc_hi          ; encoding: [0xff,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
+
+v_fract_f16 v255.l, vcc_lo
+// GFX11: v_fract_f16_e64 v255.l, vcc_lo          ; encoding: [0xff,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
+
+v_fract_f16 v5.h, v199.h
+// GFX11: v_fract_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0xc7,0x01,0x00,0x00]
+
+v_fract_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_fract_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_fract_f16 v5.l, v199.l
+// GFX11: v_fract_f16_e64 v5.l, v199.l            ; encoding: [0x05,0x00,0xdf,0xd5,0xc7,0x01,0x00,0x00]
+
+v_fract_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_fract_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_frexp_exp_i16_f16 v128.h, 0xfe0b
 // GFX11: v_frexp_exp_i16_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xda,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1406,71 +1556,137 @@ v_frexp_exp_i16_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
 v_frexp_exp_i16_f16 v5.l, v199.l quad_perm:[3,2,1,0]
 // GFX11: v_frexp_exp_i16_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xda,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_frexp_mant_f16 v128, 0xfe0b
-// GFX11: v_frexp_mant_f16_e64 v128, 0xfe0b       ; encoding: [0x80,0x00,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_frexp_mant_f16 v128.h, 0xfe0b
+// GFX11: v_frexp_mant_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_frexp_mant_f16 v128.l, 0xfe0b
+// GFX11: v_frexp_mant_f16_e64 v128.l, 0xfe0b     ; encoding: [0x80,0x00,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, -1
+// GFX11: v_frexp_mant_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xc1,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, 0.5
+// GFX11: v_frexp_mant_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xf0,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, exec_hi
+// GFX11: v_frexp_mant_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7f,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, exec_lo
+// GFX11: v_frexp_mant_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7e,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, m0
+// GFX11: v_frexp_mant_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7d,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, null
+// GFX11: v_frexp_mant_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7c,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, s1
+// GFX11: v_frexp_mant_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x01,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, s105
+// GFX11: v_frexp_mant_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x69,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, src_scc
+// GFX11: v_frexp_mant_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xfd,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, ttmp15
+// GFX11: v_frexp_mant_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, -1
-// GFX11: v_frexp_mant_f16_e64 v255, -1           ; encoding: [0xff,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v1.h
+// GFX11: v_frexp_mant_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00]
 
-v_frexp_mant_f16 v255, 0.5
-// GFX11: v_frexp_mant_f16_e64 v255, 0.5          ; encoding: [0xff,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_frexp_mant_f16 v255, exec_hi
-// GFX11: v_frexp_mant_f16_e64 v255, exec_hi      ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_frexp_mant_f16 v255, exec_lo
-// GFX11: v_frexp_mant_f16_e64 v255, exec_lo      ; encoding: [0xff,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v127.h
+// GFX11: v_frexp_mant_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xd9,0xd5,0x7f,0x01,0x00,0x00]
 
-v_frexp_mant_f16 v255, m0
-// GFX11: v_frexp_mant_f16_e64 v255, m0           ; encoding: [0xff,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_frexp_mant_f16 v255, null
-// GFX11: v_frexp_mant_f16_e64 v255, null         ; encoding: [0xff,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_frexp_mant_f16 v255, s1
-// GFX11: v_frexp_mant_f16_e64 v255, s1           ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, vcc_hi
+// GFX11: v_frexp_mant_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x6b,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, s105
-// GFX11: v_frexp_mant_f16_e64 v255, s105         ; encoding: [0xff,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, vcc_lo
+// GFX11: v_frexp_mant_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x6a,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, src_scc
-// GFX11: v_frexp_mant_f16_e64 v255, src_scc      ; encoding: [0xff,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.l, -1
+// GFX11: v_frexp_mant_f16_e64 v255.l, -1         ; encoding: [0xff,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, ttmp15
-// GFX11: v_frexp_mant_f16_e64 v255, ttmp15       ; encoding: [0xff,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.l, 0.5
+// GFX11: v_frexp_mant_f16_e64 v255.l, 0.5        ; encoding: [0xff,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v1
-// GFX11: v_frexp_mant_f16_e64 v255, v1           ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+v_frexp_mant_f16 v255.l, exec_hi
+// GFX11: v_frexp_mant_f16_e64 v255.l, exec_hi    ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_frexp_mant_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16 v255.l, exec_lo
+// GFX11: v_frexp_mant_f16_e64 v255.l, exec_lo    ; encoding: [0xff,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX11: v_frexp_mant_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_frexp_mant_f16 v255.l, m0
+// GFX11: v_frexp_mant_f16_e64 v255.l, m0         ; encoding: [0xff,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v127
-// GFX11: v_frexp_mant_f16_e64 v255, v127         ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x01,0x00,0x00]
+v_frexp_mant_f16 v255.l, null
+// GFX11: v_frexp_mant_f16_e64 v255.l, null       ; encoding: [0xff,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_frexp_mant_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_frexp_mant_f16 v255.l, s1
+// GFX11: v_frexp_mant_f16_e64 v255.l, s1         ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX11: v_frexp_mant_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_frexp_mant_f16 v255.l, s105
+// GFX11: v_frexp_mant_f16_e64 v255.l, s105       ; encoding: [0xff,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, vcc_hi
-// GFX11: v_frexp_mant_f16_e64 v255, vcc_hi       ; encoding: [0xff,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.l, src_scc
+// GFX11: v_frexp_mant_f16_e64 v255.l, src_scc    ; encoding: [0xff,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, vcc_lo
-// GFX11: v_frexp_mant_f16_e64 v255, vcc_lo       ; encoding: [0xff,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.l, ttmp15
+// GFX11: v_frexp_mant_f16_e64 v255.l, ttmp15     ; encoding: [0xff,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v5, v199
-// GFX11: v_frexp_mant_f16_e64 v5, v199           ; encoding: [0x05,0x00,0xd9,0xd5,0xc7,0x01,0x00,0x00]
+v_frexp_mant_f16 v255.l, v1.l
+// GFX11: v_frexp_mant_f16_e64 v255.l, v1.l       ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
 
-v_frexp_mant_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_frexp_mant_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_frexp_mant_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_frexp_mant_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_frexp_mant_f16 v255.l, v127.l
+// GFX11: v_frexp_mant_f16_e64 v255.l, v127.l     ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x01,0x00,0x00]
+
+v_frexp_mant_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_frexp_mant_f16 v255.l, vcc_hi
+// GFX11: v_frexp_mant_f16_e64 v255.l, vcc_hi     ; encoding: [0xff,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.l, vcc_lo
+// GFX11: v_frexp_mant_f16_e64 v255.l, vcc_lo     ; encoding: [0xff,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v5.h, v199.h
+// GFX11: v_frexp_mant_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0xc7,0x01,0x00,0x00]
+
+v_frexp_mant_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_frexp_mant_f16 v5.l, v199.l
+// GFX11: v_frexp_mant_f16_e64 v5.l, v199.l       ; encoding: [0x05,0x00,0xd9,0xd5,0xc7,0x01,0x00,0x00]
+
+v_frexp_mant_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_log_f16 v128, 0xfe0b
 // GFX11: v_log_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xd7,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1538,71 +1754,137 @@ v_log_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_log_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: v_log_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_not_b16 v128, 0xfe0b
-// GFX11: v_not_b16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_not_b16 v128.h, 0xfe0b
+// GFX11: v_not_b16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_not_b16 v128.l, 0xfe0b
+// GFX11: v_not_b16_e64 v128.l, 0xfe0b            ; encoding: [0x80,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_not_b16 v255.h, -1
+// GFX11: v_not_b16_e64 v255.h, -1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe9,0xd5,0xc1,0x00,0x00,0x00]
+
+v_not_b16 v255.h, 0.5
+// GFX11: v_not_b16_e64 v255.h, 0.5 op_sel:[0,1]  ; encoding: [0xff,0x40,0xe9,0xd5,0xf0,0x00,0x00,0x00]
+
+v_not_b16 v255.h, exec_hi
+// GFX11: v_not_b16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7f,0x00,0x00,0x00]
+
+v_not_b16 v255.h, exec_lo
+// GFX11: v_not_b16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7e,0x00,0x00,0x00]
+
+v_not_b16 v255.h, m0
+// GFX11: v_not_b16_e64 v255.h, m0 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe9,0xd5,0x7d,0x00,0x00,0x00]
+
+v_not_b16 v255.h, null
+// GFX11: v_not_b16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7c,0x00,0x00,0x00]
+
+v_not_b16 v255.h, s1
+// GFX11: v_not_b16_e64 v255.h, s1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe9,0xd5,0x01,0x00,0x00,0x00]
+
+v_not_b16 v255.h, s105
+// GFX11: v_not_b16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x69,0x00,0x00,0x00]
 
-v_not_b16 v255, -1
-// GFX11: v_not_b16_e64 v255, -1                  ; encoding: [0xff,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
+v_not_b16 v255.h, src_scc
+// GFX11: v_not_b16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xfd,0x00,0x00,0x00]
 
-v_not_b16 v255, 0.5
-// GFX11: v_not_b16_e64 v255, 0.5                 ; encoding: [0xff,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00]
+v_not_b16 v255.h, ttmp15
+// GFX11: v_not_b16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_not_b16 v255, exec_hi
-// GFX11: v_not_b16_e64 v255, exec_hi             ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
+v_not_b16 v255.h, v1.h
+// GFX11: v_not_b16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00]
 
-v_not_b16 v255, exec_lo
-// GFX11: v_not_b16_e64 v255, exec_lo             ; encoding: [0xff,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
+v_not_b16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_not_b16 v255, m0
-// GFX11: v_not_b16_e64 v255, m0                  ; encoding: [0xff,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
+v_not_b16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_not_b16 v255, null
-// GFX11: v_not_b16_e64 v255, null                ; encoding: [0xff,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
+v_not_b16 v255.h, v127.h
+// GFX11: v_not_b16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe9,0xd5,0x7f,0x01,0x00,0x00]
 
-v_not_b16 v255, s1
-// GFX11: v_not_b16_e64 v255, s1                  ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
+v_not_b16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_not_b16 v255, s105
-// GFX11: v_not_b16_e64 v255, s105                ; encoding: [0xff,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
+v_not_b16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_not_b16 v255, src_scc
-// GFX11: v_not_b16_e64 v255, src_scc             ; encoding: [0xff,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
+v_not_b16 v255.h, vcc_hi
+// GFX11: v_not_b16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x6b,0x00,0x00,0x00]
 
-v_not_b16 v255, ttmp15
-// GFX11: v_not_b16_e64 v255, ttmp15              ; encoding: [0xff,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
+v_not_b16 v255.h, vcc_lo
+// GFX11: v_not_b16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x6a,0x00,0x00,0x00]
 
-v_not_b16 v255, v1
-// GFX11: v_not_b16_e64 v255, v1                  ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+v_not_b16 v255.l, -1
+// GFX11: v_not_b16_e64 v255.l, -1                ; encoding: [0xff,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
 
-v_not_b16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_not_b16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_not_b16 v255.l, 0.5
+// GFX11: v_not_b16_e64 v255.l, 0.5               ; encoding: [0xff,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00]
 
-v_not_b16 v255, v1 quad_perm:[3,2,1,0]
-// GFX11: v_not_b16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_not_b16 v255.l, exec_hi
+// GFX11: v_not_b16_e64 v255.l, exec_hi           ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
 
-v_not_b16 v255, v127
-// GFX11: v_not_b16_e64 v255, v127                ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x01,0x00,0x00]
+v_not_b16 v255.l, exec_lo
+// GFX11: v_not_b16_e64 v255.l, exec_lo           ; encoding: [0xff,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
 
-v_not_b16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_not_b16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_not_b16 v255.l, m0
+// GFX11: v_not_b16_e64 v255.l, m0                ; encoding: [0xff,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
 
-v_not_b16 v255, v127 quad_perm:[3,2,1,0]
-// GFX11: v_not_b16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_not_b16 v255.l, null
+// GFX11: v_not_b16_e64 v255.l, null              ; encoding: [0xff,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
 
-v_not_b16 v255, vcc_hi
-// GFX11: v_not_b16_e64 v255, vcc_hi              ; encoding: [0xff,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
+v_not_b16 v255.l, s1
+// GFX11: v_not_b16_e64 v255.l, s1                ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
 
-v_not_b16 v255, vcc_lo
-// GFX11: v_not_b16_e64 v255, vcc_lo              ; encoding: [0xff,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
+v_not_b16 v255.l, s105
+// GFX11: v_not_b16_e64 v255.l, s105              ; encoding: [0xff,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
 
-v_not_b16 v5, v199
-// GFX11: v_not_b16_e64 v5, v199                  ; encoding: [0x05,0x00,0xe9,0xd5,0xc7,0x01,0x00,0x00]
+v_not_b16 v255.l, src_scc
+// GFX11: v_not_b16_e64 v255.l, src_scc           ; encoding: [0xff,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
 
-v_not_b16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_not_b16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_not_b16 v255.l, ttmp15
+// GFX11: v_not_b16_e64 v255.l, ttmp15            ; encoding: [0xff,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_not_b16 v5, v199 quad_perm:[3,2,1,0]
-// GFX11: v_not_b16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_not_b16 v255.l, v1.l
+// GFX11: v_not_b16_e64 v255.l, v1.l              ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+
+v_not_b16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_not_b16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_not_b16 v255.l, v127.l
+// GFX11: v_not_b16_e64 v255.l, v127.l            ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x01,0x00,0x00]
+
+v_not_b16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_not_b16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_not_b16 v255.l, vcc_hi
+// GFX11: v_not_b16_e64 v255.l, vcc_hi            ; encoding: [0xff,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
+
+v_not_b16 v255.l, vcc_lo
+// GFX11: v_not_b16_e64 v255.l, vcc_lo            ; encoding: [0xff,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
+
+v_not_b16 v5.h, v199.h
+// GFX11: v_not_b16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe9,0xd5,0xc7,0x01,0x00,0x00]
+
+v_not_b16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_not_b16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_not_b16 v5.l, v199.l
+// GFX11: v_not_b16_e64 v5.l, v199.l              ; encoding: [0x05,0x00,0xe9,0xd5,0xc7,0x01,0x00,0x00]
+
+v_not_b16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_not_b16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_rcp_f16 v128, 0xfe0b
 // GFX11: v_rcp_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xd4,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1670,71 +1952,137 @@ v_rcp_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_rcp_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: v_rcp_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_rndne_f16 v128, 0xfe0b
-// GFX11: v_rndne_f16_e64 v128, 0xfe0b            ; encoding: [0x80,0x00,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_rndne_f16 v128.h, 0xfe0b
+// GFX11: v_rndne_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_rndne_f16 v128.l, 0xfe0b
+// GFX11: v_rndne_f16_e64 v128.l, 0xfe0b          ; encoding: [0x80,0x00,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_rndne_f16 v255.h, -1
+// GFX11: v_rndne_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xc1,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, 0.5
+// GFX11: v_rndne_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xf0,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, exec_hi
+// GFX11: v_rndne_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7f,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, exec_lo
+// GFX11: v_rndne_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7e,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, m0
+// GFX11: v_rndne_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7d,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, null
+// GFX11: v_rndne_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7c,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, s1
+// GFX11: v_rndne_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x01,0x00,0x00,0x00]
 
-v_rndne_f16 v255, -1
-// GFX11: v_rndne_f16_e64 v255, -1                ; encoding: [0xff,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+v_rndne_f16 v255.h, s105
+// GFX11: v_rndne_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x69,0x00,0x00,0x00]
 
-v_rndne_f16 v255, 0.5
-// GFX11: v_rndne_f16_e64 v255, 0.5               ; encoding: [0xff,0x00,0xde,0xd5,0xf0,0x00,0x00,0x00]
+v_rndne_f16 v255.h, src_scc
+// GFX11: v_rndne_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xfd,0x00,0x00,0x00]
 
-v_rndne_f16 v255, exec_hi
-// GFX11: v_rndne_f16_e64 v255, exec_hi           ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+v_rndne_f16 v255.h, ttmp15
+// GFX11: v_rndne_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
-v_rndne_f16 v255, exec_lo
-// GFX11: v_rndne_f16_e64 v255, exec_lo           ; encoding: [0xff,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v1.h
+// GFX11: v_rndne_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
 
-v_rndne_f16 v255, m0
-// GFX11: v_rndne_f16_e64 v255, m0                ; encoding: [0xff,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_rndne_f16 v255, null
-// GFX11: v_rndne_f16_e64 v255, null              ; encoding: [0xff,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_rndne_f16 v255, s1
-// GFX11: v_rndne_f16_e64 v255, s1                ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v127.h
+// GFX11: v_rndne_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xde,0xd5,0x7f,0x01,0x00,0x00]
 
-v_rndne_f16 v255, s105
-// GFX11: v_rndne_f16_e64 v255, s105              ; encoding: [0xff,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_rndne_f16 v255, src_scc
-// GFX11: v_rndne_f16_e64 v255, src_scc           ; encoding: [0xff,0x00,0xde,0xd5,0xfd,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_rndne_f16 v255, ttmp15
-// GFX11: v_rndne_f16_e64 v255, ttmp15            ; encoding: [0xff,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+v_rndne_f16 v255.h, vcc_hi
+// GFX11: v_rndne_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v1
-// GFX11: v_rndne_f16_e64 v255, v1                ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+v_rndne_f16 v255.h, vcc_lo
+// GFX11: v_rndne_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_rndne_f16 v255.l, -1
+// GFX11: v_rndne_f16_e64 v255.l, -1              ; encoding: [0xff,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_rndne_f16 v255.l, 0.5
+// GFX11: v_rndne_f16_e64 v255.l, 0.5             ; encoding: [0xff,0x00,0xde,0xd5,0xf0,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v127
-// GFX11: v_rndne_f16_e64 v255, v127              ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x01,0x00,0x00]
+v_rndne_f16 v255.l, exec_hi
+// GFX11: v_rndne_f16_e64 v255.l, exec_hi         ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_rndne_f16 v255.l, exec_lo
+// GFX11: v_rndne_f16_e64 v255.l, exec_lo         ; encoding: [0xff,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_rndne_f16 v255.l, m0
+// GFX11: v_rndne_f16_e64 v255.l, m0              ; encoding: [0xff,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
-v_rndne_f16 v255, vcc_hi
-// GFX11: v_rndne_f16_e64 v255, vcc_hi            ; encoding: [0xff,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+v_rndne_f16 v255.l, null
+// GFX11: v_rndne_f16_e64 v255.l, null            ; encoding: [0xff,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
-v_rndne_f16 v255, vcc_lo
-// GFX11: v_rndne_f16_e64 v255, vcc_lo            ; encoding: [0xff,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+v_rndne_f16 v255.l, s1
+// GFX11: v_rndne_f16_e64 v255.l, s1              ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199
-// GFX11: v_rndne_f16_e64 v5, v199                ; encoding: [0x05,0x00,0xde,0xd5,0xc7,0x01,0x00,0x00]
+v_rndne_f16 v255.l, s105
+// GFX11: v_rndne_f16_e64 v255.l, s105            ; encoding: [0xff,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_rndne_f16 v255.l, src_scc
+// GFX11: v_rndne_f16_e64 v255.l, src_scc         ; encoding: [0xff,0x00,0xde,0xd5,0xfd,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_rndne_f16 v255.l, ttmp15
+// GFX11: v_rndne_f16_e64 v255.l, ttmp15          ; encoding: [0xff,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, v1.l
+// GFX11: v_rndne_f16_e64 v255.l, v1.l            ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+v_rndne_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_rndne_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_rndne_f16 v255.l, v127.l
+// GFX11: v_rndne_f16_e64 v255.l, v127.l          ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x01,0x00,0x00]
+
+v_rndne_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_rndne_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_rndne_f16 v255.l, vcc_hi
+// GFX11: v_rndne_f16_e64 v255.l, vcc_hi          ; encoding: [0xff,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, vcc_lo
+// GFX11: v_rndne_f16_e64 v255.l, vcc_lo          ; encoding: [0xff,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+
+v_rndne_f16 v5.h, v199.h
+// GFX11: v_rndne_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xde,0xd5,0xc7,0x01,0x00,0x00]
+
+v_rndne_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_rndne_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_rndne_f16 v5.l, v199.l
+// GFX11: v_rndne_f16_e64 v5.l, v199.l            ; encoding: [0x05,0x00,0xde,0xd5,0xc7,0x01,0x00,0x00]
+
+v_rndne_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_rndne_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_rsq_f16 v128, 0xfe0b
 // GFX11: v_rsq_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xd6,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1802,80 +2150,155 @@ v_rsq_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_rsq_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: v_rsq_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_sat_pk_u8_i16 v199, v5
-// GFX11: v_sat_pk_u8_i16_e64 v199, v5            ; encoding: [0xc7,0x00,0xe2,0xd5,0x05,0x01,0x00,0x00]
+v_sat_pk_u8_i16 v199.h, v5
+// GFX11: v_sat_pk_u8_i16_e64 v199.h, v5 op_sel:[0,1] ; encoding: [0xc7,0x40,0xe2,0xd5,0x05,0x01,0x00,0x00]
+
+v_sat_pk_u8_i16 v199.h, v5 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sat_pk_u8_i16_e64_dpp v199.h, v5 op_sel:[0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xc7,0x40,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x05,0x77,0x39,0x05]
+
+v_sat_pk_u8_i16 v199.h, v5 quad_perm:[3,2,1,0]
+// GFX11: v_sat_pk_u8_i16_e64_dpp v199.h, v5 op_sel:[0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff]
+
+v_sat_pk_u8_i16 v199.l, v5
+// GFX11: v_sat_pk_u8_i16_e64 v199.l, v5          ; encoding: [0xc7,0x00,0xe2,0xd5,0x05,0x01,0x00,0x00]
+
+v_sat_pk_u8_i16 v199.l, v5 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sat_pk_u8_i16_e64_dpp v199.l, v5 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xc7,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x05,0x77,0x39,0x05]
+
+v_sat_pk_u8_i16 v199.l, v5 quad_perm:[3,2,1,0]
+// GFX11: v_sat_pk_u8_i16_e64_dpp v199.l, v5 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff]
+
+v_sin_f16 v128.h, 0xfe0b
+// GFX11: v_sin_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_sat_pk_u8_i16 v199, v5 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_sat_pk_u8_i16_e64_dpp v199, v5 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xc7,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x05,0x77,0x39,0x05]
+v_sin_f16 v128.l, 0xfe0b
+// GFX11: v_sin_f16_e64 v128.l, 0xfe0b            ; encoding: [0x80,0x00,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_sat_pk_u8_i16 v199, v5 quad_perm:[3,2,1,0]
-// GFX11: v_sat_pk_u8_i16_e64_dpp v199, v5 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff]
+v_sin_f16 v255.h, -1
+// GFX11: v_sin_f16_e64 v255.h, -1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe0,0xd5,0xc1,0x00,0x00,0x00]
 
-v_sin_f16 v128, 0xfe0b
-// GFX11: v_sin_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_sin_f16 v255.h, 0.5
+// GFX11: v_sin_f16_e64 v255.h, 0.5 op_sel:[0,1]  ; encoding: [0xff,0x40,0xe0,0xd5,0xf0,0x00,0x00,0x00]
 
-v_sin_f16 v255, -1
-// GFX11: v_sin_f16_e64 v255, -1                  ; encoding: [0xff,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
+v_sin_f16 v255.h, exec_hi
+// GFX11: v_sin_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7f,0x00,0x00,0x00]
 
-v_sin_f16 v255, 0.5
-// GFX11: v_sin_f16_e64 v255, 0.5                 ; encoding: [0xff,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x00]
+v_sin_f16 v255.h, exec_lo
+// GFX11: v_sin_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7e,0x00,0x00,0x00]
 
-v_sin_f16 v255, exec_hi
-// GFX11: v_sin_f16_e64 v255, exec_hi             ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
+v_sin_f16 v255.h, m0
+// GFX11: v_sin_f16_e64 v255.h, m0 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe0,0xd5,0x7d,0x00,0x00,0x00]
 
-v_sin_f16 v255, exec_lo
-// GFX11: v_sin_f16_e64 v255, exec_lo             ; encoding: [0xff,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
+v_sin_f16 v255.h, null
+// GFX11: v_sin_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7c,0x00,0x00,0x00]
 
-v_sin_f16 v255, m0
-// GFX11: v_sin_f16_e64 v255, m0                  ; encoding: [0xff,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
+v_sin_f16 v255.h, s1
+// GFX11: v_sin_f16_e64 v255.h, s1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe0,0xd5,0x01,0x00,0x00,0x00]
 
-v_sin_f16 v255, null
-// GFX11: v_sin_f16_e64 v255, null                ; encoding: [0xff,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
+v_sin_f16 v255.h, s105
+// GFX11: v_sin_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x69,0x00,0x00,0x00]
 
-v_sin_f16 v255, s1
-// GFX11: v_sin_f16_e64 v255, s1                  ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
+v_sin_f16 v255.h, src_scc
+// GFX11: v_sin_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0xfd,0x00,0x00,0x00]
 
-v_sin_f16 v255, s105
-// GFX11: v_sin_f16_e64 v255, s105                ; encoding: [0xff,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
+v_sin_f16 v255.h, ttmp15
+// GFX11: v_sin_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7b,0x00,0x00,0x00]
 
-v_sin_f16 v255, src_scc
-// GFX11: v_sin_f16_e64 v255, src_scc             ; encoding: [0xff,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x00]
+v_sin_f16 v255.h, v1.h
+// GFX11: v_sin_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00]
 
-v_sin_f16 v255, ttmp15
-// GFX11: v_sin_f16_e64 v255, ttmp15              ; encoding: [0xff,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
+v_sin_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_sin_f16 v255, v1
-// GFX11: v_sin_f16_e64 v255, v1                  ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+v_sin_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_sin_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_sin_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_sin_f16 v255.h, v127.h
+// GFX11: v_sin_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe0,0xd5,0x7f,0x01,0x00,0x00]
 
-v_sin_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX11: v_sin_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_sin_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_sin_f16 v255, v127
-// GFX11: v_sin_f16_e64 v255, v127                ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x01,0x00,0x00]
+v_sin_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_sin_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_sin_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_sin_f16 v255.h, vcc_hi
+// GFX11: v_sin_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x6b,0x00,0x00,0x00]
 
-v_sin_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX11: v_sin_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_sin_f16 v255.h, vcc_lo
+// GFX11: v_sin_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x6a,0x00,0x00,0x00]
 
-v_sin_f16 v255, vcc_hi
-// GFX11: v_sin_f16_e64 v255, vcc_hi              ; encoding: [0xff,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
+v_sin_f16 v255.l, -1
+// GFX11: v_sin_f16_e64 v255.l, -1                ; encoding: [0xff,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
 
-v_sin_f16 v255, vcc_lo
-// GFX11: v_sin_f16_e64 v255, vcc_lo              ; encoding: [0xff,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
+v_sin_f16 v255.l, 0.5
+// GFX11: v_sin_f16_e64 v255.l, 0.5               ; encoding: [0xff,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x00]
 
-v_sin_f16 v5, v199
-// GFX11: v_sin_f16_e64 v5, v199                  ; encoding: [0x05,0x00,0xe0,0xd5,0xc7,0x01,0x00,0x00]
+v_sin_f16 v255.l, exec_hi
+// GFX11: v_sin_f16_e64 v255.l, exec_hi           ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
 
-v_sin_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_sin_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_sin_f16 v255.l, exec_lo
+// GFX11: v_sin_f16_e64 v255.l, exec_lo           ; encoding: [0xff,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
 
-v_sin_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX11: v_sin_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_sin_f16 v255.l, m0
+// GFX11: v_sin_f16_e64 v255.l, m0                ; encoding: [0xff,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, null
+// GFX11: v_sin_f16_e64 v255.l, null              ; encoding: [0xff,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, s1
+// GFX11: v_sin_f16_e64 v255.l, s1                ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, s105
+// GFX11: v_sin_f16_e64 v255.l, s105              ; encoding: [0xff,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, src_scc
+// GFX11: v_sin_f16_e64 v255.l, src_scc           ; encoding: [0xff,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, ttmp15
+// GFX11: v_sin_f16_e64 v255.l, ttmp15            ; encoding: [0xff,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, v1.l
+// GFX11: v_sin_f16_e64 v255.l, v1.l              ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+
+v_sin_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_sin_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_sin_f16 v255.l, v127.l
+// GFX11: v_sin_f16_e64 v255.l, v127.l            ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x01,0x00,0x00]
+
+v_sin_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_sin_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_sin_f16 v255.l, vcc_hi
+// GFX11: v_sin_f16_e64 v255.l, vcc_hi            ; encoding: [0xff,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, vcc_lo
+// GFX11: v_sin_f16_e64 v255.l, vcc_lo            ; encoding: [0xff,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
+
+v_sin_f16 v5.h, v199.h
+// GFX11: v_sin_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe0,0xd5,0xc7,0x01,0x00,0x00]
+
+v_sin_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_sin_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_sin_f16 v5.l, v199.l
+// GFX11: v_sin_f16_e64 v5.l, v199.l              ; encoding: [0x05,0x00,0xe0,0xd5,0xc7,0x01,0x00,0x00]
+
+v_sin_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_sin_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_sqrt_f16 v128, 0xfe0b
 // GFX11: v_sqrt_f16_e64 v128, 0xfe0b             ; encoding: [0x80,0x00,0xd5,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1943,69 +2366,134 @@ v_sqrt_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_sqrt_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: v_sqrt_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_trunc_f16 v128, 0xfe0b
-// GFX11: v_trunc_f16_e64 v128, 0xfe0b            ; encoding: [0x80,0x00,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_trunc_f16 v128.h, 0xfe0b
+// GFX11: v_trunc_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_trunc_f16 v128.l, 0xfe0b
+// GFX11: v_trunc_f16_e64 v128.l, 0xfe0b          ; encoding: [0x80,0x00,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_trunc_f16 v255.h, -1
+// GFX11: v_trunc_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, 0.5
+// GFX11: v_trunc_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xf0,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, exec_hi
+// GFX11: v_trunc_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7f,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, exec_lo
+// GFX11: v_trunc_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7e,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, m0
+// GFX11: v_trunc_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7d,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, null
+// GFX11: v_trunc_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7c,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, s1
+// GFX11: v_trunc_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x01,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, s105
+// GFX11: v_trunc_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x69,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, src_scc
+// GFX11: v_trunc_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xfd,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, ttmp15
+// GFX11: v_trunc_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7b,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, v1.h
+// GFX11: v_trunc_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00]
+
+v_trunc_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_trunc_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_trunc_f16 v255.h, v127.h
+// GFX11: v_trunc_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdd,0xd5,0x7f,0x01,0x00,0x00]
+
+v_trunc_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_trunc_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_trunc_f16 v255.h, vcc_hi
+// GFX11: v_trunc_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x6b,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, vcc_lo
+// GFX11: v_trunc_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x6a,0x00,0x00,0x00]
+
+v_trunc_f16 v255.l, -1
+// GFX11: v_trunc_f16_e64 v255.l, -1              ; encoding: [0xff,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+
+v_trunc_f16 v255.l, 0.5
+// GFX11: v_trunc_f16_e64 v255.l, 0.5             ; encoding: [0xff,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x00]
 
-v_trunc_f16 v255, -1
-// GFX11: v_trunc_f16_e64 v255, -1                ; encoding: [0xff,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+v_trunc_f16 v255.l, exec_hi
+// GFX11: v_trunc_f16_e64 v255.l, exec_hi         ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
 
-v_trunc_f16 v255, 0.5
-// GFX11: v_trunc_f16_e64 v255, 0.5               ; encoding: [0xff,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x00]
+v_trunc_f16 v255.l, exec_lo
+// GFX11: v_trunc_f16_e64 v255.l, exec_lo         ; encoding: [0xff,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
 
-v_trunc_f16 v255, exec_hi
-// GFX11: v_trunc_f16_e64 v255, exec_hi           ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
+v_trunc_f16 v255.l, m0
+// GFX11: v_trunc_f16_e64 v255.l, m0              ; encoding: [0xff,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
 
-v_trunc_f16 v255, exec_lo
-// GFX11: v_trunc_f16_e64 v255, exec_lo           ; encoding: [0xff,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
+v_trunc_f16 v255.l, null
+// GFX11: v_trunc_f16_e64 v255.l, null            ; encoding: [0xff,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
 
-v_trunc_f16 v255, m0
-// GFX11: v_trunc_f16_e64 v255, m0                ; encoding: [0xff,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
+v_trunc_f16 v255.l, s1
+// GFX11: v_trunc_f16_e64 v255.l, s1              ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
 
-v_trunc_f16 v255, null
-// GFX11: v_trunc_f16_e64 v255, null              ; encoding: [0xff,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
+v_trunc_f16 v255.l, s105
+// GFX11: v_trunc_f16_e64 v255.l, s105            ; encoding: [0xff,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
 
-v_trunc_f16 v255, s1
-// GFX11: v_trunc_f16_e64 v255, s1                ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
+v_trunc_f16 v255.l, src_scc
+// GFX11: v_trunc_f16_e64 v255.l, src_scc         ; encoding: [0xff,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x00]
 
-v_trunc_f16 v255, s105
-// GFX11: v_trunc_f16_e64 v255, s105              ; encoding: [0xff,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
+v_trunc_f16 v255.l, ttmp15
+// GFX11: v_trunc_f16_e64 v255.l, ttmp15          ; encoding: [0xff,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
 
-v_trunc_f16 v255, src_scc
-// GFX11: v_trunc_f16_e64 v255, src_scc           ; encoding: [0xff,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x00]
+v_trunc_f16 v255.l, v1.l
+// GFX11: v_trunc_f16_e64 v255.l, v1.l            ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
 
-v_trunc_f16 v255, ttmp15
-// GFX11: v_trunc_f16_e64 v255, ttmp15            ; encoding: [0xff,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
+v_trunc_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_trunc_f16 v255, v1
-// GFX11: v_trunc_f16_e64 v255, v1                ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+v_trunc_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_trunc_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_trunc_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_trunc_f16 v255.l, v127.l
+// GFX11: v_trunc_f16_e64 v255.l, v127.l          ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x01,0x00,0x00]
 
-v_trunc_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX11: v_trunc_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_trunc_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_trunc_f16 v255, v127
-// GFX11: v_trunc_f16_e64 v255, v127              ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x01,0x00,0x00]
+v_trunc_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_trunc_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_trunc_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_trunc_f16 v255.l, vcc_hi
+// GFX11: v_trunc_f16_e64 v255.l, vcc_hi          ; encoding: [0xff,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
 
-v_trunc_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX11: v_trunc_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_trunc_f16 v255.l, vcc_lo
+// GFX11: v_trunc_f16_e64 v255.l, vcc_lo          ; encoding: [0xff,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
 
-v_trunc_f16 v255, vcc_hi
-// GFX11: v_trunc_f16_e64 v255, vcc_hi            ; encoding: [0xff,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
+v_trunc_f16 v5.h, v199.h
+// GFX11: v_trunc_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdd,0xd5,0xc7,0x01,0x00,0x00]
 
-v_trunc_f16 v255, vcc_lo
-// GFX11: v_trunc_f16_e64 v255, vcc_lo            ; encoding: [0xff,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
+v_trunc_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
 
-v_trunc_f16 v5, v199
-// GFX11: v_trunc_f16_e64 v5, v199                ; encoding: [0x05,0x00,0xdd,0xd5,0xc7,0x01,0x00,0x00]
+v_trunc_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_trunc_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_trunc_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_trunc_f16 v5.l, v199.l
+// GFX11: v_trunc_f16_e64 v5.l, v199.l            ; encoding: [0x05,0x00,0xdd,0xd5,0xc7,0x01,0x00,0x00]
 
+v_trunc_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
 
-v_trunc_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX11: v_trunc_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_trunc_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
index 628edf74..6bc92bc 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
@@ -2231,53 +2231,77 @@ v_fma_dx9_zero_f32 v5, -src_scc, |vcc_lo|, -1 mul:4
 v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2
 // GFX11: v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x09,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf]
 
-v_fma_f16 v5, v1, v2, s3
-// GFX11: v_fma_f16 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+v_fma_f16 v5.l, v1.l, v2.l, s3
+// GFX11: v_fma_f16 v5.l, v1.l, v2.l, s3          ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
 
-v_fma_f16 v5, v255, s2, s105
-// GFX11: v_fma_f16 v5, v255, s2, s105            ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+v_fma_f16 v5.l, v255.l, s2, s105
+// GFX11: v_fma_f16 v5.l, v255.l, s2, s105        ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
 
-v_fma_f16 v5, s1, v255, exec_hi
-// GFX11: v_fma_f16 v5, s1, v255, exec_hi         ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+v_fma_f16 v5.l, s1, v255.l, exec_hi
+// GFX11: v_fma_f16 v5.l, s1, v255.l, exec_hi     ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
 
-v_fma_f16 v5, s105, s105, exec_lo
-// GFX11: v_fma_f16 v5, s105, s105, exec_lo       ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+v_fma_f16 v5.l, s105, s105, exec_lo
+// GFX11: v_fma_f16 v5.l, s105, s105, exec_lo     ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
 
-v_fma_f16 v5, vcc_lo, ttmp15, v3
-// GFX11: v_fma_f16 v5, vcc_lo, ttmp15, v3        ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX11: v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l    ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_fma_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX11: v_fma_f16 v5, vcc_hi, 0xfe0b, v255      ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX11: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l  ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX11: v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX11: v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_fma_f16 v5, m0, 0.5, m0
-// GFX11: v_fma_f16 v5, m0, 0.5, m0               ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+v_fma_f16 v5.l, m0, 0.5, m0
+// GFX11: v_fma_f16 v5.l, m0, 0.5, m0             ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_fma_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX11: v_fma_f16 v5, |exec_lo|, -1, vcc_hi     ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX11: v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi   ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
 
-v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1]
-// GFX11: v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_fma_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1]
+// GFX11: v_fma_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_fma_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0]
-// GFX11: v_fma_f16 v5, null, exec_lo, -|0xfe0b|  ; encoding: [0x05,0x04,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v5.l, null, exec_lo, -|0xfe0b|
+// GFX11: v_fma_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0]
-// GFX11: v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0]
+// GFX11: v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0]
-// GFX11: v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0]
+// GFX11: v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
 
-v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
-// GFX11: v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
+// GFX11: v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
 
-v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp
-// GFX11: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp
+// GFX11: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
-v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2
-// GFX11: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2
+// GFX11: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+v_fma_f16 v5.l, v255.h, s2, s105
+// GFX11: v_fma_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+
+v_fma_f16 v5.l, s1, v255.h, exec_hi
+// GFX11: v_fma_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+
+v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX11: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_fma_f16 v5.l, -|exec_hi|, null, -|vcc_lo|
+// GFX11: v_fma_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+
+v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc|
+// GFX11: v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+
+v_fma_f16 v5.l, 0.5, -m0, 0.5
+// GFX11: v_fma_f16 v5.l, 0.5, -m0, 0.5           ; encoding: [0x05,0x00,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+
+v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1
+// GFX11: v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1  ; encoding: [0x05,0x02,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+
+v_fma_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX11: v_fma_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 v_fma_f32 v5, v1, v2, s3
 // GFX11: v_fma_f32 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x13,0xd6,0x01,0x05,0x0e,0x00]
@@ -3722,50 +3746,62 @@ v_max_u16 v5.l, v255.l, v255.h
 v_max_u16 v255.h, 0xfe0b, vcc_hi
 // GFX11: v_max_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] ; encoding: [0xff,0x40,0x09,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_maxmin_f16 v5, v1, v2, s3
-// GFX11: v_maxmin_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
+v_maxmin_f16 v5.l, v1.l, v2.l, s3
+// GFX11: v_maxmin_f16 v5.l, v1.l, v2.l, s3       ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
 
-v_maxmin_f16 v5, v255, s2, s105
-// GFX11: v_maxmin_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
+v_maxmin_f16 v5.l, v255.l, s2, s105
+// GFX11: v_maxmin_f16 v5.l, v255.l, s2, s105     ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
 
-v_maxmin_f16 v5, s1, v255, exec_hi
-// GFX11: v_maxmin_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
+v_maxmin_f16 v5.l, s1, v255.l, exec_hi
+// GFX11: v_maxmin_f16 v5.l, s1, v255.l, exec_hi  ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
 
-v_maxmin_f16 v5, s105, s105, exec_lo
-// GFX11: v_maxmin_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
+v_maxmin_f16 v5.l, s105, s105, exec_lo
+// GFX11: v_maxmin_f16 v5.l, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
 
-v_maxmin_f16 v5, vcc_lo, ttmp15, v3
-// GFX11: v_maxmin_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
+v_maxmin_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX11: v_maxmin_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX11: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX11: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_maxmin_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX11: v_maxmin_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
+v_maxmin_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX11: v_maxmin_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_maxmin_f16 v5, m0, 0.5, m0
-// GFX11: v_maxmin_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
+v_maxmin_f16 v5.l, m0, 0.5, m0
+// GFX11: v_maxmin_f16 v5.l, m0, 0.5, m0          ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_maxmin_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX11: v_maxmin_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
+v_maxmin_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX11: v_maxmin_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
 
-v_maxmin_f16 v5, -|exec_hi|, null, -|vcc_lo|
-// GFX11: v_maxmin_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_maxmin_f16 v5.l, -|exec_hi|, null, -|vcc_lo|
+// GFX11: v_maxmin_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_maxmin_f16 v5, null, exec_lo, -|0xfe0b|
-// GFX11: v_maxmin_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_maxmin_f16 v5.l, null, exec_lo, -|0xfe0b|
+// GFX11: v_maxmin_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_maxmin_f16 v5, -1, -|exec_hi|, -|src_scc|
-// GFX11: v_maxmin_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_maxmin_f16 v5.l, -1, -|exec_hi|, -|src_scc|
+// GFX11: v_maxmin_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_maxmin_f16 v5, 0.5, -m0, 0.5 mul:2
-// GFX11: v_maxmin_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
+v_maxmin_f16 v5.l, 0.5, -m0, 0.5 mul:2
+// GFX11: v_maxmin_f16 v5.l, 0.5, -m0, 0.5 mul:2  ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
 
-v_maxmin_f16 v5, -src_scc, |vcc_lo|, -1 mul:4
-// GFX11: v_maxmin_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
+v_maxmin_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4
+// GFX11: v_maxmin_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
 
-v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2
-// GFX11: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+v_maxmin_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX11: v_maxmin_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+v_maxmin_f16 v5.l, v255.h, s2, s105
+// GFX11: v_maxmin_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01]
+
+v_maxmin_f16 v5.l, s1, v255.h, exec_hi
+// GFX11: v_maxmin_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01]
+
+v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX11: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_maxmin_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX11: v_maxmin_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 v_maxmin_f32 v5, v1, v2, s3
 // GFX11: v_maxmin_f32 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x5e,0xd6,0x01,0x05,0x0e,0x00]
@@ -4799,50 +4835,62 @@ v_min_u16 v5.l, v255.l, v255.h
 v_min_u16 v255.h, 0xfe0b, vcc_hi
 // GFX11: v_min_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] ; encoding: [0xff,0x40,0x0b,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_minmax_f16 v5, v1, v2, s3
-// GFX11: v_minmax_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
+v_minmax_f16 v5.l, v1.l, v2.l, s3
+// GFX11: v_minmax_f16 v5.l, v1.l, v2.l, s3       ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
+
+v_minmax_f16 v5.l, v255.l, s2, s105
+// GFX11: v_minmax_f16 v5.l, v255.l, s2, s105     ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
+
+v_minmax_f16 v5.l, s1, v255.l, exec_hi
+// GFX11: v_minmax_f16 v5.l, s1, v255.l, exec_hi  ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
+
+v_minmax_f16 v5.l, s105, s105, exec_lo
+// GFX11: v_minmax_f16 v5.l, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
+
+v_minmax_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX11: v_minmax_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_minmax_f16 v5, v255, s2, s105
-// GFX11: v_minmax_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
+v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX11: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_minmax_f16 v5, s1, v255, exec_hi
-// GFX11: v_minmax_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
+v_minmax_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX11: v_minmax_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_minmax_f16 v5, s105, s105, exec_lo
-// GFX11: v_minmax_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
+v_minmax_f16 v5.l, m0, 0.5, m0
+// GFX11: v_minmax_f16 v5.l, m0, 0.5, m0          ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_minmax_f16 v5, vcc_lo, ttmp15, v3
-// GFX11: v_minmax_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
+v_minmax_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX11: v_minmax_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
 
-v_minmax_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX11: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_minmax_f16 v5.l, -|exec_hi|, null, -|vcc_lo|
+// GFX11: v_minmax_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_minmax_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX11: v_minmax_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
+v_minmax_f16 v5.l, null, exec_lo, -|0xfe0b|
+// GFX11: v_minmax_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_minmax_f16 v5, m0, 0.5, m0
-// GFX11: v_minmax_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
+v_minmax_f16 v5.l, -1, -|exec_hi|, -|src_scc|
+// GFX11: v_minmax_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_minmax_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX11: v_minmax_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
+v_minmax_f16 v5.l, 0.5, -m0, 0.5 mul:2
+// GFX11: v_minmax_f16 v5.l, 0.5, -m0, 0.5 mul:2  ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
 
-v_minmax_f16 v5, -|exec_hi|, null, -|vcc_lo|
-// GFX11: v_minmax_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_minmax_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4
+// GFX11: v_minmax_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
 
-v_minmax_f16 v5, null, exec_lo, -|0xfe0b|
-// GFX11: v_minmax_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_minmax_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX11: v_minmax_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
-v_minmax_f16 v5, -1, -|exec_hi|, -|src_scc|
-// GFX11: v_minmax_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_minmax_f16 v5.l, v255.h, s2, s105
+// GFX11: v_minmax_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01]
 
-v_minmax_f16 v5, 0.5, -m0, 0.5 mul:2
-// GFX11: v_minmax_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
+v_minmax_f16 v5.l, s1, v255.h, exec_hi
+// GFX11: v_minmax_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01]
 
-v_minmax_f16 v5, -src_scc, |vcc_lo|, -1 mul:4
-// GFX11: v_minmax_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
+v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX11: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2
-// GFX11: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+v_minmax_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX11: v_minmax_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 v_minmax_f32 v5, v1, v2, s3
 // GFX11: v_minmax_f32 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x5f,0xd6,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
index acbdcfc..5fa1334 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
@@ -1508,47 +1508,83 @@ v_div_fixup_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 row_xmask:0 row_mask:0x1 ban
 v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x54,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
 
-v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX11: v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
-// GFX11: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15
-// GFX11: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13]
 
-v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
+v_fma_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_fma_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
+
+v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h quad_perm:[3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h quad_perm:[0,1,2,3]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+
+v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x05,0x48,0xd6,0xfa,0x04,0xfe,0xa1,0x01,0x21,0x01,0xff]
+
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x48,0xd6,0xfa,0x04,0xfa,0xc1,0x01,0x2f,0x01,0xff]
+
+v_fma_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x50,0x01,0xff]
+
+v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+
+v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x09,0x13]
+
+v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
 
 v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX11: v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -2624,47 +2660,92 @@ v_max_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_
 v_max_u16_e64_dpp v255.h, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_max_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0x09,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX11: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15
-// GFX11: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0xfe,0xa1,0x01,0x21,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xfa,0xc1,0x01,0x2f,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
+v_maxmin_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x50,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
+v_maxmin_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
+
+v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h quad_perm:[3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h quad_perm:[0,1,2,3]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
+
+v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 
 v_maxmin_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX11: v_maxmin_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -3668,47 +3749,92 @@ v_min_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_
 v_min_u16_e64_dpp v255.h, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_min_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0x0b,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
+
+v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0xfe,0xa1,0x01,0x21,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xfa,0xc1,0x01,0x2f,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x50,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX11: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
 
-v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
-// GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
 
-v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15
-// GFX11: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h quad_perm:[3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h quad_perm:[0,1,2,3]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
-v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
 
-v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
 
-v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
+v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 
 v_minmax_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX11: v_minmax_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -4833,20 +4959,20 @@ v_div_fixup_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0
 v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
 // GFX11: v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x54,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
-v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
-// GFX11: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX11: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
 
-v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX11: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
+// GFX11: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
 v_mad_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
 // GFX11: v_mad_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x53,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s
index f38ff6a..c26834c 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s
@@ -220,47 +220,56 @@ v_clz_i32_u32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_mirror
-// GFX11: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+
+v_cos_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_cos_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_cos_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -967,47 +976,50 @@ v_cvt_i32_f32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x05,0x30]
 
-v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_mirror
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
 
-v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
+v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
+
+v_cvt_i32_i16_e64_dpp v255, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
 v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1330,47 +1342,50 @@ v_cvt_u32_f32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_cvt_u32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cvt_u32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x87,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x05,0x30]
 
-v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_mirror
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_mirror
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_half_mirror
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:1
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:15
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:1
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:15
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:1
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:15
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
+v_cvt_u32_u16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cvt_u32_u16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
+v_cvt_u32_u16_e64_dpp v255, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0x08,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
 v_exp_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
 // GFX11: v_exp_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1684,47 +1699,56 @@ v_floor_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct
 v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_mirror
-// GFX11: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_fract_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_fract_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_fract_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1861,47 +1885,56 @@ v_frexp_exp_i32_f32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_
 v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x05,0x30]
 
-v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_mirror
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_frexp_mant_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_frexp_mant_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2248,47 +2281,56 @@ v_movrelsd_b32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:
 v_movrelsd_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_movrelsd_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xc4,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_mirror
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_mirror
-// GFX11: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_shl:1
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_shl:15
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_shr:1
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_shr:15
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_ror:1
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_ror:15
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
-v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
 
-v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
+v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
+v_not_b16_e64_dpp v5.h, v1.h row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+
+v_not_b16_e64_dpp v5.l, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
+
+v_not_b16_e64_dpp v255.h, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
 v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2467,47 +2509,56 @@ v_rcp_iflag_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 boun
 v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_mirror
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_rndne_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_rndne_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_rndne_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2644,89 +2695,101 @@ v_rsq_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl
 v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xae,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[3,2,1,0]
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[0,1,2,3]
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_mirror
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_half_mirror
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:1
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:15
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:1
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:15
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:1
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:15
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
+v_sat_pk_u8_i16_e64_dpp v255.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_sat_pk_u8_i16_e64_dpp v255.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
+v_sat_pk_u8_i16_e64_dpp v255.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_mirror
-// GFX11: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+
+v_sin_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_sin_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_sin_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2863,47 +2926,56 @@ v_sqrt_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr
 v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+
+v_trunc_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+
+v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_mirror
-// GFX11: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_trunc_f16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_trunc_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_trunc_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_trunc_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s
index cef5017..2fc0206 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s
@@ -814,41 +814,74 @@ v_div_fixup_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x54,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
-v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x48,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x48,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x48,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+v_fma_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_fma_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x48,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+
+v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x48,0xd6,0xe9,0x04,0xfe,0xa1,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x48,0xd6,0xe9,0x04,0xfa,0xc1,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x13,0x48,0xd6,0xea,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x48,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 v_fma_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_fma_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -1627,41 +1660,80 @@ v_max_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_max_u16_e64_dpp v255.h, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_max_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0x09,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x60,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x60,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0xfe,0xa1,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xfa,0xc1,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x60,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x60,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x03,0x60,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x13,0x60,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x60,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 v_maxmin_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_maxmin_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -2401,41 +2473,80 @@ v_min_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_min_u16_e64_dpp v255.h, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_min_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0x0b,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x61,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x61,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0xfe,0xa1,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xfa,0xc1,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x03,0x61,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x61,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x13,0x61,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x61,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x61,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 v_minmax_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_minmax_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -3130,20 +3241,20 @@ v_div_fixup_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5
 v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
 // GFX11: v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x54,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
-v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX11: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
+// GFX11: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 v_mad_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_mad_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x53,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s
index 9540788..259be1d 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s
@@ -61,17 +61,26 @@ v_clz_i32_u32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xb9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+v_cos_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_cos_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x08,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_cos_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc1,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -283,14 +292,17 @@ v_cvt_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x88,0xd5,0xe9,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
-v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+v_cvt_i32_i16_e64_dpp v255, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0x08,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -385,14 +397,17 @@ v_cvt_u32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cvt_u32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_cvt_u32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x87,0xd5,0xe9,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
-v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cvt_u32_u16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cvt_u32_u16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cvt_u32_u16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cvt_u32_u16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+v_cvt_u32_u16_e64_dpp v255, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0x08,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 v_exp_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_exp_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -487,17 +502,26 @@ v_floor_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xa4,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_fract_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_fract_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x08,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_fract_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc1,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -538,17 +562,26 @@ v_frexp_exp_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0xbf,0xd5,0xe9,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_frexp_mant_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x08,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc1,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -640,14 +673,23 @@ v_movrelsd_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_movrelsd_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_movrelsd_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xc4,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+v_not_b16_e64_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_not_b16_e64_dpp v5.l, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x08,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_not_b16_e64_dpp v255.h, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0x40,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -703,17 +745,26 @@ v_rcp_iflag_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xab,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_rndne_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_rndne_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x08,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_rndne_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc1,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -760,26 +811,38 @@ v_rsq_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xae,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_sat_pk_u8_i16_e64_dpp v255.l, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_sat_pk_u8_i16_e64_dpp v255.l, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64_dpp v255.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0x40,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+v_sin_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_sin_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x08,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_sin_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc1,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -826,17 +889,26 @@ v_sqrt_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xb3,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_trunc_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_trunc_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x08,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_trunc_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc1,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s
index 3850f02..379cf062 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s
@@ -262,50 +262,59 @@ v_clz_i32_u32_e64 v5, src_scc
 v_clz_i32_u32_e64 v255, 0xaf123456
 // GFX11: v_clz_i32_u32_e64 v255, 0xaf123456      ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_cos_f16_e64 v5, v1
-// GFX11: v_cos_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+v_cos_f16_e64 v5.l, v1.l
+// GFX11: v_cos_f16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
 
-v_cos_f16_e64 v5, v255
-// GFX11: v_cos_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
+v_cos_f16_e64 v5.l, v255.l
+// GFX11: v_cos_f16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
 
-v_cos_f16_e64 v5, s1
-// GFX11: v_cos_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, s1
+// GFX11: v_cos_f16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, s105
-// GFX11: v_cos_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, s105
+// GFX11: v_cos_f16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, vcc_lo
-// GFX11: v_cos_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, vcc_lo
+// GFX11: v_cos_f16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, vcc_hi
-// GFX11: v_cos_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, vcc_hi
+// GFX11: v_cos_f16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, ttmp15
-// GFX11: v_cos_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, ttmp15
+// GFX11: v_cos_f16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, m0
-// GFX11: v_cos_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, m0
+// GFX11: v_cos_f16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, exec_lo
-// GFX11: v_cos_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, exec_lo
+// GFX11: v_cos_f16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, exec_hi
-// GFX11: v_cos_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, exec_hi
+// GFX11: v_cos_f16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, null
-// GFX11: v_cos_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, null
+// GFX11: v_cos_f16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, -1
-// GFX11: v_cos_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, -1
+// GFX11: v_cos_f16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, 0.5 mul:2
-// GFX11: v_cos_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
+v_cos_f16_e64 v5.l, 0.5 mul:2
+// GFX11: v_cos_f16_e64 v5.l, 0.5 mul:2           ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
 
-v_cos_f16_e64 v5, src_scc mul:4
-// GFX11: v_cos_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
+v_cos_f16_e64 v5.l, src_scc mul:4
+// GFX11: v_cos_f16_e64 v5.l, src_scc mul:4       ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
 
-v_cos_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX11: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX11: v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+v_cos_f16_e64 v5.h, v1.h
+// GFX11: [0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00]
+
+v_cos_f16_e64 v5.l, v255.h
+// GFX11: [0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00]
+
+v_cos_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX11: [0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_cos_f32_e64 v5, v1
 // GFX11: v_cos_f32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00]
@@ -1267,11 +1276,11 @@ v_cvt_i32_f64_e64 v5, -|src_scc|
 v_cvt_i32_f64_e64 v255, 0xaf123456 clamp
 // GFX11: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_cvt_i32_i16_e64 v5, v1
-// GFX11: v_cvt_i32_i16_e64 v5, v1                ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00]
+v_cvt_i32_i16_e64 v5, v1.l
+// GFX11: v_cvt_i32_i16_e64 v5, v1.l              ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00]
 
-v_cvt_i32_i16_e64 v5, v255
-// GFX11: v_cvt_i32_i16_e64 v5, v255              ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
+v_cvt_i32_i16_e64 v5, v255.l
+// GFX11: v_cvt_i32_i16_e64 v5, v255.l            ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
 
 v_cvt_i32_i16_e64 v5, s1
 // GFX11: v_cvt_i32_i16_e64 v5, s1                ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00]
@@ -1312,6 +1321,9 @@ v_cvt_i32_i16_e64 v5, src_scc
 v_cvt_i32_i16_e64 v255, 0xfe0b
 // GFX11: v_cvt_i32_i16_e64 v255, 0xfe0b          ; encoding: [0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
+v_cvt_i32_i16_e64 v5, v255.h
+// GFX11: [0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00]
+
 v_cvt_nearest_i32_f32_e64 v5, v1
 // GFX11: v_cvt_nearest_i32_f32_e64 v5, v1        ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00]
 
@@ -1690,11 +1702,11 @@ v_cvt_u32_f64_e64 v5, -|src_scc|
 v_cvt_u32_f64_e64 v255, 0xaf123456 clamp
 // GFX11: v_cvt_u32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x95,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_cvt_u32_u16_e64 v5, v1
-// GFX11: v_cvt_u32_u16_e64 v5, v1                ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00]
+v_cvt_u32_u16_e64 v5, v1.l
+// GFX11: v_cvt_u32_u16_e64 v5, v1.l              ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00]
 
-v_cvt_u32_u16_e64 v5, v255
-// GFX11: v_cvt_u32_u16_e64 v5, v255              ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
+v_cvt_u32_u16_e64 v5, v255.l
+// GFX11: v_cvt_u32_u16_e64 v5, v255.l            ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
 
 v_cvt_u32_u16_e64 v5, s1
 // GFX11: v_cvt_u32_u16_e64 v5, s1                ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x00,0x00,0x00]
@@ -1735,6 +1747,9 @@ v_cvt_u32_u16_e64 v5, src_scc
 v_cvt_u32_u16_e64 v255, 0xfe0b
 // GFX11: v_cvt_u32_u16_e64 v255, 0xfe0b          ; encoding: [0xff,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
+v_cvt_u32_u16_e64 v5, v255.h
+// GFX11: [0x05,0x08,0xeb,0xd5,0xff,0x01,0x00,0x00]
+
 v_exp_f16_e64 v5, v1
 // GFX11: v_exp_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00]
 
@@ -2086,50 +2101,59 @@ v_floor_f64_e64 v[5:6], -|src_scc| mul:4
 v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2
 // GFX11: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
-v_fract_f16_e64 v5, v1
-// GFX11: v_fract_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+v_fract_f16_e64 v5.l, v1.l
+// GFX11: v_fract_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+
+v_fract_f16_e64 v5.l, v255.l
+// GFX11: v_fract_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
 
-v_fract_f16_e64 v5, v255
-// GFX11: v_fract_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
+v_fract_f16_e64 v5.l, s1
+// GFX11: v_fract_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, s1
-// GFX11: v_fract_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, s105
+// GFX11: v_fract_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, s105
-// GFX11: v_fract_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, vcc_lo
+// GFX11: v_fract_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, vcc_lo
-// GFX11: v_fract_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, vcc_hi
+// GFX11: v_fract_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, vcc_hi
-// GFX11: v_fract_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, ttmp15
+// GFX11: v_fract_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, ttmp15
-// GFX11: v_fract_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, m0
+// GFX11: v_fract_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, m0
-// GFX11: v_fract_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, exec_lo
+// GFX11: v_fract_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, exec_lo
-// GFX11: v_fract_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, exec_hi
+// GFX11: v_fract_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, exec_hi
-// GFX11: v_fract_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, null
+// GFX11: v_fract_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, null
-// GFX11: v_fract_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, -1
+// GFX11: v_fract_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, -1
-// GFX11: v_fract_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, 0.5 mul:2
+// GFX11: v_fract_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08]
 
-v_fract_f16_e64 v5, 0.5 mul:2
-// GFX11: v_fract_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08]
+v_fract_f16_e64 v5.l, src_scc mul:4
+// GFX11: v_fract_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10]
 
-v_fract_f16_e64 v5, src_scc mul:4
-// GFX11: v_fract_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10]
+v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX11: v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
-v_fract_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX11: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_fract_f16_e64 v5.h, v1.h
+// GFX11: [0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00]
+
+v_fract_f16_e64 v5.l, v255.h
+// GFX11: [0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00]
+
+v_fract_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX11: [0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_fract_f32_e64 v5, v1
 // GFX11: v_fract_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00]
@@ -2347,50 +2371,59 @@ v_frexp_exp_i32_f64_e64 v5, -|src_scc|
 v_frexp_exp_i32_f64_e64 v255, 0xaf123456
 // GFX11: v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_frexp_mant_f16_e64 v5, v1
-// GFX11: v_frexp_mant_f16_e64 v5, v1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, v1.l
+// GFX11: v_frexp_mant_f16_e64 v5.l, v1.l         ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5.l, v255.l
+// GFX11: v_frexp_mant_f16_e64 v5.l, v255.l       ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, v255
-// GFX11: v_frexp_mant_f16_e64 v5, v255           ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, s1
+// GFX11: v_frexp_mant_f16_e64 v5.l, s1           ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, s1
-// GFX11: v_frexp_mant_f16_e64 v5, s1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, s105
+// GFX11: v_frexp_mant_f16_e64 v5.l, s105         ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, s105
-// GFX11: v_frexp_mant_f16_e64 v5, s105           ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, vcc_lo
+// GFX11: v_frexp_mant_f16_e64 v5.l, vcc_lo       ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, vcc_lo
-// GFX11: v_frexp_mant_f16_e64 v5, vcc_lo         ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, vcc_hi
+// GFX11: v_frexp_mant_f16_e64 v5.l, vcc_hi       ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, vcc_hi
-// GFX11: v_frexp_mant_f16_e64 v5, vcc_hi         ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, ttmp15
+// GFX11: v_frexp_mant_f16_e64 v5.l, ttmp15       ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, ttmp15
-// GFX11: v_frexp_mant_f16_e64 v5, ttmp15         ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, m0
+// GFX11: v_frexp_mant_f16_e64 v5.l, m0           ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, m0
-// GFX11: v_frexp_mant_f16_e64 v5, m0             ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, exec_lo
+// GFX11: v_frexp_mant_f16_e64 v5.l, exec_lo      ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, exec_lo
-// GFX11: v_frexp_mant_f16_e64 v5, exec_lo        ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, exec_hi
+// GFX11: v_frexp_mant_f16_e64 v5.l, exec_hi      ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, exec_hi
-// GFX11: v_frexp_mant_f16_e64 v5, exec_hi        ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, null
+// GFX11: v_frexp_mant_f16_e64 v5.l, null         ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, null
-// GFX11: v_frexp_mant_f16_e64 v5, null           ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, -1
+// GFX11: v_frexp_mant_f16_e64 v5.l, -1           ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, -1
-// GFX11: v_frexp_mant_f16_e64 v5, -1             ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, 0.5 mul:2
+// GFX11: v_frexp_mant_f16_e64 v5.l, 0.5 mul:2    ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08]
 
-v_frexp_mant_f16_e64 v5, 0.5 mul:2
-// GFX11: v_frexp_mant_f16_e64 v5, 0.5 mul:2      ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08]
+v_frexp_mant_f16_e64 v5.l, src_scc mul:4
+// GFX11: v_frexp_mant_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10]
 
-v_frexp_mant_f16_e64 v5, src_scc mul:4
-// GFX11: v_frexp_mant_f16_e64 v5, src_scc mul:4  ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10]
+v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX11: v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
-v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX11: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_frexp_mant_f16_e64 v5.h, v1.h
+// GFX11: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5.l, v255.h
+// GFX11: [0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00]
+
+v_frexp_mant_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX11: [0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_frexp_mant_f32_e64 v5, v1
 // GFX11: v_frexp_mant_f32_e64 v5, v1             ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00]
@@ -2674,50 +2707,59 @@ v_movrelsd_b32_e64 v255, v255
 v_nop_e64
 // GFX11: v_nop                                   ; encoding: [0x00,0x00,0x80,0xd5,0x00,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, v1
-// GFX11: v_not_b16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+v_not_b16_e64 v5.l, v1.l
+// GFX11: v_not_b16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+
+v_not_b16_e64 v5.l, v255.l
+// GFX11: v_not_b16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
 
-v_not_b16_e64 v5, v255
-// GFX11: v_not_b16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
+v_not_b16_e64 v5.l, s1
+// GFX11: v_not_b16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, s1
-// GFX11: v_not_b16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, s105
+// GFX11: v_not_b16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, s105
-// GFX11: v_not_b16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, vcc_lo
+// GFX11: v_not_b16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, vcc_lo
-// GFX11: v_not_b16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, vcc_hi
+// GFX11: v_not_b16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, vcc_hi
-// GFX11: v_not_b16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, ttmp15
+// GFX11: v_not_b16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, ttmp15
-// GFX11: v_not_b16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, m0
+// GFX11: v_not_b16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, m0
-// GFX11: v_not_b16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, exec_lo
+// GFX11: v_not_b16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, exec_lo
-// GFX11: v_not_b16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, exec_hi
+// GFX11: v_not_b16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, exec_hi
-// GFX11: v_not_b16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, null
+// GFX11: v_not_b16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, null
-// GFX11: v_not_b16_e64 v5, null                  ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, -1
+// GFX11: v_not_b16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, -1
-// GFX11: v_not_b16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, 0.5
+// GFX11: v_not_b16_e64 v5.l, 0.5                 ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, 0.5
-// GFX11: v_not_b16_e64 v5, 0.5                   ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, src_scc
+// GFX11: v_not_b16_e64 v5.l, src_scc             ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, src_scc
-// GFX11: v_not_b16_e64 v5, src_scc               ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
+v_not_b16_e64 v255.l, 0xfe0b
+// GFX11: v_not_b16_e64 v255.l, 0xfe0b            ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_not_b16_e64 v255, 0xfe0b
-// GFX11: v_not_b16_e64 v255, 0xfe0b              ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_not_b16_e64 v5.h, v1.h
+// GFX11: [0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00]
+
+v_not_b16_e64 v5.l, v255.h
+// GFX11: [0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00]
+
+v_not_b16_e64 v255.h, 0xfe0b
+// GFX11: [0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 v_not_b32_e64 v5, v1
 // GFX11: v_not_b32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00]
@@ -2938,50 +2980,59 @@ v_rcp_iflag_f32_e64 v5, src_scc mul:4
 v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2
 // GFX11: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf]
 
-v_rndne_f16_e64 v5, v1
-// GFX11: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+v_rndne_f16_e64 v5.l, v1.l
+// GFX11: v_rndne_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v5.l, v255.l
+// GFX11: v_rndne_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
 
-v_rndne_f16_e64 v5, v255
-// GFX11: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+v_rndne_f16_e64 v5.l, s1
+// GFX11: v_rndne_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, s1
-// GFX11: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, s105
+// GFX11: v_rndne_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, s105
-// GFX11: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, vcc_lo
+// GFX11: v_rndne_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, vcc_lo
-// GFX11: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, vcc_hi
+// GFX11: v_rndne_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, vcc_hi
-// GFX11: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, ttmp15
+// GFX11: v_rndne_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, ttmp15
-// GFX11: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, m0
+// GFX11: v_rndne_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, m0
-// GFX11: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, exec_lo
+// GFX11: v_rndne_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, exec_lo
-// GFX11: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, exec_hi
+// GFX11: v_rndne_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, exec_hi
-// GFX11: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, null
+// GFX11: v_rndne_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, null
-// GFX11: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, -1
+// GFX11: v_rndne_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, -1
-// GFX11: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, 0.5 mul:2
+// GFX11: v_rndne_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
 
-v_rndne_f16_e64 v5, 0.5 mul:2
-// GFX11: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+v_rndne_f16_e64 v5.l, src_scc mul:4
+// GFX11: v_rndne_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
 
-v_rndne_f16_e64 v5, src_scc mul:4
-// GFX11: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX11: v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
-v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX11: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_rndne_f16_e64 v5.h, v1.h
+// GFX11: [0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v5.l, v255.h
+// GFX11: [0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX11: [0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_rndne_f32_e64 v5, v1
 // GFX11: v_rndne_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00]
@@ -3190,95 +3241,107 @@ v_rsq_f64_e64 v[5:6], -|src_scc| mul:4
 v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2
 // GFX11: v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb1,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
-v_sat_pk_u8_i16_e64 v5, v1
-// GFX11: v_sat_pk_u8_i16_e64 v5, v1              ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, v1
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, v1            ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00]
+
+v_sat_pk_u8_i16_e64 v5.l, v255
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, v255          ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, v255
-// GFX11: v_sat_pk_u8_i16_e64 v5, v255            ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, s1
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, s1            ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, s1
-// GFX11: v_sat_pk_u8_i16_e64 v5, s1              ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, s105
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, s105          ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, s105
-// GFX11: v_sat_pk_u8_i16_e64 v5, s105            ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, vcc_lo
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, vcc_lo        ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, vcc_lo
-// GFX11: v_sat_pk_u8_i16_e64 v5, vcc_lo          ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, vcc_hi
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, vcc_hi        ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, vcc_hi
-// GFX11: v_sat_pk_u8_i16_e64 v5, vcc_hi          ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, ttmp15
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, ttmp15        ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, ttmp15
-// GFX11: v_sat_pk_u8_i16_e64 v5, ttmp15          ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, m0
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, m0            ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, m0
-// GFX11: v_sat_pk_u8_i16_e64 v5, m0              ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, exec_lo
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, exec_lo       ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, exec_lo
-// GFX11: v_sat_pk_u8_i16_e64 v5, exec_lo         ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, exec_hi
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, exec_hi       ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, exec_hi
-// GFX11: v_sat_pk_u8_i16_e64 v5, exec_hi         ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, null
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, null          ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, null
-// GFX11: v_sat_pk_u8_i16_e64 v5, null            ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, -1
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, -1            ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, -1
-// GFX11: v_sat_pk_u8_i16_e64 v5, -1              ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, 0.5
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, 0.5           ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, 0.5
-// GFX11: v_sat_pk_u8_i16_e64 v5, 0.5             ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, src_scc
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, src_scc       ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, src_scc
-// GFX11: v_sat_pk_u8_i16_e64 v5, src_scc         ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v255.l, 0xfe0b
+// GFX11: v_sat_pk_u8_i16_e64 v255.l, 0xfe0b      ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v255, 0xfe0b
-// GFX11: v_sat_pk_u8_i16_e64 v255, 0xfe0b        ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_sat_pk_u8_i16_e64 v255.h, 0xfe0b
+// GFX11: [0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_sin_f16_e64 v5, v1
-// GFX11: v_sin_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+v_sin_f16_e64 v5.l, v1.l
+// GFX11: v_sin_f16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
 
-v_sin_f16_e64 v5, v255
-// GFX11: v_sin_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
+v_sin_f16_e64 v5.l, v255.l
+// GFX11: v_sin_f16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
 
-v_sin_f16_e64 v5, s1
-// GFX11: v_sin_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, s1
+// GFX11: v_sin_f16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, s105
-// GFX11: v_sin_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, s105
+// GFX11: v_sin_f16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, vcc_lo
-// GFX11: v_sin_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, vcc_lo
+// GFX11: v_sin_f16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, vcc_hi
-// GFX11: v_sin_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, vcc_hi
+// GFX11: v_sin_f16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, ttmp15
-// GFX11: v_sin_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, ttmp15
+// GFX11: v_sin_f16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, m0
-// GFX11: v_sin_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, m0
+// GFX11: v_sin_f16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, exec_lo
-// GFX11: v_sin_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, exec_lo
+// GFX11: v_sin_f16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, exec_hi
-// GFX11: v_sin_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, exec_hi
+// GFX11: v_sin_f16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, null
-// GFX11: v_sin_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, null
+// GFX11: v_sin_f16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, -1
-// GFX11: v_sin_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, -1
+// GFX11: v_sin_f16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, 0.5 mul:2
-// GFX11: v_sin_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08]
+v_sin_f16_e64 v5.l, 0.5 mul:2
+// GFX11: v_sin_f16_e64 v5.l, 0.5 mul:2           ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08]
 
-v_sin_f16_e64 v5, src_scc mul:4
-// GFX11: v_sin_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
+v_sin_f16_e64 v5.l, src_scc mul:4
+// GFX11: v_sin_f16_e64 v5.l, src_scc mul:4       ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
 
-v_sin_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX11: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX11: v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+v_sin_f16_e64 v5.h, v1.h
+// GFX11: [0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00]
+
+v_sin_f16_e64 v5.l, v255.h
+// GFX11: [0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00]
+
+v_sin_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX11: [0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_sin_f32_e64 v5, v1
 // GFX11: v_sin_f32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00]
@@ -3451,50 +3514,59 @@ v_sqrt_f64_e64 v[5:6], -|src_scc| mul:4
 v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2
 // GFX11: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
-v_trunc_f16_e64 v5, v1
-// GFX11: v_trunc_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+v_trunc_f16_e64 v5.l, v1.l
+// GFX11: v_trunc_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+
+v_trunc_f16_e64 v5.l, v255.l
+// GFX11: v_trunc_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
+
+v_trunc_f16_e64 v5.l, s1
+// GFX11: v_trunc_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v5.l, s105
+// GFX11: v_trunc_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, v255
-// GFX11: v_trunc_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
+v_trunc_f16_e64 v5.l, vcc_lo
+// GFX11: v_trunc_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, s1
-// GFX11: v_trunc_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, vcc_hi
+// GFX11: v_trunc_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, s105
-// GFX11: v_trunc_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, ttmp15
+// GFX11: v_trunc_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, vcc_lo
-// GFX11: v_trunc_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, m0
+// GFX11: v_trunc_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, vcc_hi
-// GFX11: v_trunc_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, exec_lo
+// GFX11: v_trunc_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, ttmp15
-// GFX11: v_trunc_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, exec_hi
+// GFX11: v_trunc_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, m0
-// GFX11: v_trunc_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, null
+// GFX11: v_trunc_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, exec_lo
-// GFX11: v_trunc_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, -1
+// GFX11: v_trunc_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, exec_hi
-// GFX11: v_trunc_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, 0.5 mul:2
+// GFX11: v_trunc_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08]
 
-v_trunc_f16_e64 v5, null
-// GFX11: v_trunc_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, src_scc mul:4
+// GFX11: v_trunc_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10]
 
-v_trunc_f16_e64 v5, -1
-// GFX11: v_trunc_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX11: v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
-v_trunc_f16_e64 v5, 0.5 mul:2
-// GFX11: v_trunc_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08]
+v_trunc_f16_e64 v5.h, v1.h
+// GFX11: [0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00]
 
-v_trunc_f16_e64 v5, src_scc mul:4
-// GFX11: v_trunc_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10]
+v_trunc_f16_e64 v5.l, v255.h
+// GFX11: [0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00]
 
-v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX11: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_trunc_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX11: [0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_trunc_f32_e64 v5, v1
 // GFX11: v_trunc_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s
index a0d11c9..ee82fa3 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s
@@ -255,3 +255,127 @@ image_store_pck v5, v1, s[8:15] dmask:0x1 th:TH_STORE_NT
 
 image_store_mip_pck v5, [v0, v1], s[8:15] dmask:0x1 th:TH_STORE_NT
 // NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+// null is not allowed as SRSRC or SSAMP
+image_atomic_add v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_and v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_cmpswap v[0:1], v[10:11], null dmask:0x3 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_dec v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_inc v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_or v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_smax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_smin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_sub v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_swap v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_umax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_umin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_xor v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4 v[64:67], v32, null, s[4:11], dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4 v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_b v[64:67], [v32, v33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_b v[64:67], [v32, v33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_c v[64:67], [v32, v33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_c v[64:67], [v32, v33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4h v[64:67], v32, null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4h v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_l v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_l v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_o v[64:67], [v32, v33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_o v[64:67], [v32, v33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_load v[4:7], v0, null dmask:0xf dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_store v[0:3], v[254:255], null dmask:0xf dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample v[5:6], v1, null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample v[5:6], v1, s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_b v[5:6], [v1, v2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_b v[5:6], [v1, v2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_c v[5:6], [v1, v2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_c v[5:6], [v1, v2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_d v[5:6], [v1, v2, v3], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_d v[5:6], [v1, v2, v3], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_l v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_l v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_o v[5:6], [v1, v2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_o v[5:6], [v1, v2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_bvh_intersect_ray v[4:7], [v9, v10, v[11:13], v[14:16], v[17:19]], null
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_bvh64_intersect_ray v[4:7], [v[9:10], v11, v[12:14], v[15:17], v[18:20]], null
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_smem.s b/llvm/test/MC/AMDGPU/gfx12_asm_smem.s
index 668f767..2ef0274 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_smem.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_smem.s
@@ -541,6 +541,25 @@ s_load_b512 s[20:35], s[2:3], m0
 s_load_b512 s[20:35], s[2:3], 0x0
 // GFX12: s_load_b512 s[20:35], s[2:3], 0x0       ; encoding: [0x01,0x85,0x00,0xf4,0x00,0x00,0x00,0xf8]
 
+// null as dst
+s_load_b32 null, s[2:3], s0 offset:0x0
+// GFX12: encoding: [0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b64 null, s[2:3], s0 offset:0x0
+// GFX12: encoding: [0x01,0x3f,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b96 null, s[2:3], s0 offset:0x0
+// GFX12: encoding: [0x01,0xbf,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b128 null, s[2:3], s0 offset:0x0
+// GFX12: encoding: [0x01,0x5f,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b256 null, s[2:3], s0 offset:0x0
+// GFX12: encoding: [0x01,0x7f,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b512 null, s[2:3], s0 offset:0x0
+// GFX12: encoding: [0x01,0x9f,0x00,0xf4,0x00,0x00,0x00,0x00]
+
 s_buffer_load_b32 s5, s[4:7], s0
 // GFX12: s_buffer_load_b32 s5, s[4:7], s0 offset:0x0 ; encoding: [0x42,0x01,0x02,0xf4,0x00,0x00,0x00,0x00]
 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_smem_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_smem_err.s
new file mode 100644
index 0000000..49d7c72
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_smem_err.s
@@ -0,0 +1,49 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefixes=NOGFX12 --implicit-check-not=error: %s
+
+s_buffer_load_b32 s4, null, s101
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_b64 s4, null, s101
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_b128 s4, null, s101
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_b256 s4, null, s101
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_b512 s4, null, s101
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dword s4, null, s101
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx2 s[4:5], null, s101
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx4 s[4:7], null, s101
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx8 s[4:11], null, s101
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx16 s[4:19], null, s101
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_atc_probe_buffer 7, null, s2
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_prefetch_data null, 100, s10, 7
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_i8 s5, null, s0
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_u8 s5, null, s0
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_i16 s5, null, s0
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_u16 s5, null, s0
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_err.s
new file mode 100644
index 0000000..040119c
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_err.s
@@ -0,0 +1,49 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefixes=NOGFX12 --implicit-check-not=error: %s
+
+tbuffer_load_format_d16_x v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_d16_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_x v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_err.s
new file mode 100644
index 0000000..2c9ce7a
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_err.s
@@ -0,0 +1,220 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefixes=NOGFX12 --implicit-check-not=error: %s
+
+buffer_atomic_add_f32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_add_u32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_add_u64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_and_b32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_and_b64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_cmpswap_b32 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_cmpswap_b64 v[5:8], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_cond_sub_u32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_dec_u32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_dec_u64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_inc_u32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_inc_u64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_max_i32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_max_i64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_max_num_f32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_max_u32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_max_u64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_min_i32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_min_i64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_min_u32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_min_u64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_min_num_f32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_or_b32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_or_b64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_pk_add_bf16 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_pk_add_f16 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_sub_clamp_u32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_sub_u32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_sub_u64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_swap_b32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_swap_b64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_xor_b32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_xor_b64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_b128 v[5:8], v0, null, s3 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_b32 v5, v0, null, s3 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b64 v[1:2], v0, null, s4 idxen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b96 v[1:3], v0, null, s4 idxen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_b16 v5, v0, null, s3 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_format_xy v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_format_xyz v[3:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_format_xyzw v[3:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_hi_b16 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_hi_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_hi_i8 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_hi_u8 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_i8 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_u8 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_i16 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_i8 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_u16 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_u8 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b128 v[3:6], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b16 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b32 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b64 v[3:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b8 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b96 v[3:5], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_format_xy v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_format_xyz v[3:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_format_xyzw v[3:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_hi_b16 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_hi_b8 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_hi_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_x v1, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_xy v[1:2], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_xyz v[1:3], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_xyzw v[1:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
index edd3b91..e21e5bf 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
@@ -265,50 +265,62 @@ v_clz_i32_u32 v5, src_scc
 v_clz_i32_u32 v255, 0xaf123456
 // GFX12: v_clz_i32_u32_e32 v255, 0xaf123456 ; encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
-v_cos_f16 v5, v1
-// GFX12: v_cos_f16_e32 v5, v1 ; encoding: [0x01,0xc3,0x0a,0x7e]
+v_cos_f16 v5.l, v1.l
+// GFX12: v_cos_f16_e32 v5.l, v1.l ; encoding: [0x01,0xc3,0x0a,0x7e]
 
-v_cos_f16 v5, v127
-// GFX12: v_cos_f16_e32 v5, v127 ; encoding: [0x7f,0xc3,0x0a,0x7e]
+v_cos_f16 v5.l, v127.l
+// GFX12: v_cos_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xc3,0x0a,0x7e]
 
-v_cos_f16 v5, s1
-// GFX12: v_cos_f16_e32 v5, s1 ; encoding: [0x01,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, s1
+// GFX12: v_cos_f16_e32 v5.l, s1 ; encoding: [0x01,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, s105
-// GFX12: v_cos_f16_e32 v5, s105 ; encoding: [0x69,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, s105
+// GFX12: v_cos_f16_e32 v5.l, s105 ; encoding: [0x69,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, vcc_lo
-// GFX12: v_cos_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, vcc_lo
+// GFX12: v_cos_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, vcc_hi
-// GFX12: v_cos_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, vcc_hi
+// GFX12: v_cos_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, ttmp15
-// GFX12: v_cos_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, ttmp15
+// GFX12: v_cos_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, m0
-// GFX12: v_cos_f16_e32 v5, m0 ; encoding: [0x7d,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, m0
+// GFX12: v_cos_f16_e32 v5.l, m0 ; encoding: [0x7d,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, exec_lo
-// GFX12: v_cos_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, exec_lo
+// GFX12: v_cos_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, exec_hi
-// GFX12: v_cos_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, exec_hi
+// GFX12: v_cos_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, null
-// GFX12: v_cos_f16_e32 v5, null ; encoding: [0x7c,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, null
+// GFX12: v_cos_f16_e32 v5.l, null ; encoding: [0x7c,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, -1
-// GFX12: v_cos_f16_e32 v5, -1 ; encoding: [0xc1,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, -1
+// GFX12: v_cos_f16_e32 v5.l, -1 ; encoding: [0xc1,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, 0.5
-// GFX12: v_cos_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, 0.5
+// GFX12: v_cos_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, src_scc
-// GFX12: v_cos_f16_e32 v5, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, src_scc
+// GFX12: v_cos_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e]
 
-v_cos_f16 v127, 0xfe0b
-// GFX12: v_cos_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_cos_f16 v127.l, 0xfe0b
+// GFX12: v_cos_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cos_f16 v5.l, v1.h
+// GFX12: v_cos_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc3,0x0a,0x7e]
+
+v_cos_f16 v5.l, v127.h
+// GFX12: v_cos_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc3,0x0a,0x7e]
+
+v_cos_f16 v5.h, src_scc
+// GFX12: v_cos_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7f]
+
+v_cos_f16 v127.h, 0xfe0b
+// GFX12: v_cos_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_cos_f32 v5, v1
 // GFX12: v_cos_f32_e32 v5, v1 ; encoding: [0x01,0x6d,0x0a,0x7e]
@@ -1338,11 +1350,11 @@ v_cvt_i32_f64 v5, src_scc
 v_cvt_i32_f64 v255, 0xaf123456
 // GFX12: v_cvt_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x06,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
-v_cvt_i32_i16 v5, v1
-// GFX12: v_cvt_i32_i16_e32 v5, v1 ; encoding: [0x01,0xd5,0x0a,0x7e]
+v_cvt_i32_i16 v5, v1.l
+// GFX12: v_cvt_i32_i16_e32 v5, v1.l ; encoding: [0x01,0xd5,0x0a,0x7e]
 
-v_cvt_i32_i16 v5, v127
-// GFX12: v_cvt_i32_i16_e32 v5, v127 ; encoding: [0x7f,0xd5,0x0a,0x7e]
+v_cvt_i32_i16 v5, v127.l
+// GFX12: v_cvt_i32_i16_e32 v5, v127.l ; encoding: [0x7f,0xd5,0x0a,0x7e]
 
 v_cvt_i32_i16 v5, s1
 // GFX12: v_cvt_i32_i16_e32 v5, s1 ; encoding: [0x01,0xd4,0x0a,0x7e]
@@ -1384,6 +1396,12 @@ v_cvt_i32_i16 v5, src_scc
 v_cvt_i32_i16 v255, 0xfe0b
 // GFX12: v_cvt_i32_i16_e32 v255, 0xfe0b ; encoding: [0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
+v_cvt_i32_i16 v5, v1.h
+// GFX12: v_cvt_i32_i16_e32 v5, v1.h ; encoding: [0x81,0xd5,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, v127.h
+// GFX12: v_cvt_i32_i16_e32 v5, v127.h ; encoding: [0xff,0xd5,0x0a,0x7e]
+
 v_cvt_nearest_i32_f32 v5, v1
 // GFX12: v_cvt_nearest_i32_f32_e32 v5, v1 ; encoding: [0x01,0x19,0x0a,0x7e]
 
@@ -1771,11 +1789,11 @@ v_cvt_u32_f64 v5, src_scc
 v_cvt_u32_f64 v255, 0xaf123456
 // GFX12: v_cvt_u32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x2a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
-v_cvt_u32_u16 v5, v1
-// GFX12: v_cvt_u32_u16_e32 v5, v1 ; encoding: [0x01,0xd7,0x0a,0x7e]
+v_cvt_u32_u16 v5, v1.l
+// GFX12: v_cvt_u32_u16_e32 v5, v1.l ; encoding: [0x01,0xd7,0x0a,0x7e]
 
-v_cvt_u32_u16 v5, v127
-// GFX12: v_cvt_u32_u16_e32 v5, v127 ; encoding: [0x7f,0xd7,0x0a,0x7e]
+v_cvt_u32_u16 v5, v127.l
+// GFX12: v_cvt_u32_u16_e32 v5, v127.l ; encoding: [0x7f,0xd7,0x0a,0x7e]
 
 v_cvt_u32_u16 v5, s1
 // GFX12: v_cvt_u32_u16_e32 v5, s1 ; encoding: [0x01,0xd6,0x0a,0x7e]
@@ -1817,6 +1835,12 @@ v_cvt_u32_u16 v5, src_scc
 v_cvt_u32_u16 v255, 0xfe0b
 // GFX12: v_cvt_u32_u16_e32 v255, 0xfe0b ; encoding: [0xff,0xd6,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
+v_cvt_u32_u16 v5, v1.h
+// GFX12: v_cvt_u32_u16_e32 v5, v1.h ; encoding: [0x81,0xd7,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, v127.h
+// GFX12: v_cvt_u32_u16_e32 v5, v127.h ; encoding: [0xff,0xd7,0x0a,0x7e]
+
 v_exp_f16 v5.l, v1.l
 // GFX12: v_exp_f16_e32 v5.l, v1.l ; encoding: [0x01,0xb1,0x0a,0x7e]
 
@@ -2168,50 +2192,62 @@ v_floor_f64 v[5:6], src_scc
 v_floor_f64 v[254:255], 0xaf123456
 // GFX12: v_floor_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x34,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
-v_fract_f16 v5, v1
-// GFX12: v_fract_f16_e32 v5, v1 ; encoding: [0x01,0xbf,0x0a,0x7e]
+v_fract_f16 v5.l, v1.l
+// GFX12: v_fract_f16_e32 v5.l, v1.l ; encoding: [0x01,0xbf,0x0a,0x7e]
+
+v_fract_f16 v5.l, v127.l
+// GFX12: v_fract_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbf,0x0a,0x7e]
 
-v_fract_f16 v5, v127
-// GFX12: v_fract_f16_e32 v5, v127 ; encoding: [0x7f,0xbf,0x0a,0x7e]
+v_fract_f16 v5.l, s1
+// GFX12: v_fract_f16_e32 v5.l, s1 ; encoding: [0x01,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, s1
-// GFX12: v_fract_f16_e32 v5, s1 ; encoding: [0x01,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, s105
+// GFX12: v_fract_f16_e32 v5.l, s105 ; encoding: [0x69,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, s105
-// GFX12: v_fract_f16_e32 v5, s105 ; encoding: [0x69,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, vcc_lo
+// GFX12: v_fract_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, vcc_lo
-// GFX12: v_fract_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, vcc_hi
+// GFX12: v_fract_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, vcc_hi
-// GFX12: v_fract_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, ttmp15
+// GFX12: v_fract_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, ttmp15
-// GFX12: v_fract_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, m0
+// GFX12: v_fract_f16_e32 v5.l, m0 ; encoding: [0x7d,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, m0
-// GFX12: v_fract_f16_e32 v5, m0 ; encoding: [0x7d,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, exec_lo
+// GFX12: v_fract_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, exec_lo
-// GFX12: v_fract_f16_e32 v5, exec_lo ; encoding: [0x7e,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, exec_hi
+// GFX12: v_fract_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, exec_hi
-// GFX12: v_fract_f16_e32 v5, exec_hi ; encoding: [0x7f,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, null
+// GFX12: v_fract_f16_e32 v5.l, null ; encoding: [0x7c,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, null
-// GFX12: v_fract_f16_e32 v5, null ; encoding: [0x7c,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, -1
+// GFX12: v_fract_f16_e32 v5.l, -1 ; encoding: [0xc1,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, -1
-// GFX12: v_fract_f16_e32 v5, -1 ; encoding: [0xc1,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, 0.5
+// GFX12: v_fract_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, 0.5
-// GFX12: v_fract_f16_e32 v5, 0.5 ; encoding: [0xf0,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, src_scc
+// GFX12: v_fract_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, src_scc
-// GFX12: v_fract_f16_e32 v5, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e]
+v_fract_f16 v127.l, 0xfe0b
+// GFX12: v_fract_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_fract_f16 v127, 0xfe0b
-// GFX12: v_fract_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_fract_f16 v5.l, v1.h
+// GFX12: v_fract_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbf,0x0a,0x7e]
+
+v_fract_f16 v5.l, v127.h
+// GFX12: v_fract_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbf,0x0a,0x7e]
+
+v_fract_f16 v5.h, src_scc
+// GFX12: v_fract_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7f]
+
+v_fract_f16 v127.h, 0xfe0b
+// GFX12: v_fract_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_fract_f32 v5, v1
 // GFX12: v_fract_f32_e32 v5, v1 ; encoding: [0x01,0x41,0x0a,0x7e]
@@ -2432,50 +2468,62 @@ v_frexp_exp_i32_f64 v5, src_scc
 v_frexp_exp_i32_f64 v255, 0xaf123456
 // GFX12: v_frexp_exp_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x78,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
-v_frexp_mant_f16 v5, v1
-// GFX12: v_frexp_mant_f16_e32 v5, v1 ; encoding: [0x01,0xb3,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, v1.l
+// GFX12: v_frexp_mant_f16_e32 v5.l, v1.l ; encoding: [0x01,0xb3,0x0a,0x7e]
+
+v_frexp_mant_f16 v5.l, v127.l
+// GFX12: v_frexp_mant_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xb3,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, v127
-// GFX12: v_frexp_mant_f16_e32 v5, v127 ; encoding: [0x7f,0xb3,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, s1
+// GFX12: v_frexp_mant_f16_e32 v5.l, s1 ; encoding: [0x01,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, s1
-// GFX12: v_frexp_mant_f16_e32 v5, s1 ; encoding: [0x01,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, s105
+// GFX12: v_frexp_mant_f16_e32 v5.l, s105 ; encoding: [0x69,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, s105
-// GFX12: v_frexp_mant_f16_e32 v5, s105 ; encoding: [0x69,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, vcc_lo
+// GFX12: v_frexp_mant_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, vcc_lo
-// GFX12: v_frexp_mant_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, vcc_hi
+// GFX12: v_frexp_mant_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, vcc_hi
-// GFX12: v_frexp_mant_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, ttmp15
+// GFX12: v_frexp_mant_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, ttmp15
-// GFX12: v_frexp_mant_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, m0
+// GFX12: v_frexp_mant_f16_e32 v5.l, m0 ; encoding: [0x7d,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, m0
-// GFX12: v_frexp_mant_f16_e32 v5, m0 ; encoding: [0x7d,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, exec_lo
+// GFX12: v_frexp_mant_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, exec_lo
-// GFX12: v_frexp_mant_f16_e32 v5, exec_lo ; encoding: [0x7e,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, exec_hi
+// GFX12: v_frexp_mant_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, exec_hi
-// GFX12: v_frexp_mant_f16_e32 v5, exec_hi ; encoding: [0x7f,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, null
+// GFX12: v_frexp_mant_f16_e32 v5.l, null ; encoding: [0x7c,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, null
-// GFX12: v_frexp_mant_f16_e32 v5, null ; encoding: [0x7c,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, -1
+// GFX12: v_frexp_mant_f16_e32 v5.l, -1 ; encoding: [0xc1,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, -1
-// GFX12: v_frexp_mant_f16_e32 v5, -1 ; encoding: [0xc1,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, 0.5
+// GFX12: v_frexp_mant_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, 0.5
-// GFX12: v_frexp_mant_f16_e32 v5, 0.5 ; encoding: [0xf0,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, src_scc
+// GFX12: v_frexp_mant_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, src_scc
-// GFX12: v_frexp_mant_f16_e32 v5, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v127.l, 0xfe0b
+// GFX12: v_frexp_mant_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_frexp_mant_f16 v127, 0xfe0b
-// GFX12: v_frexp_mant_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_frexp_mant_f16 v5.l, v1.h
+// GFX12: v_frexp_mant_f16_e32 v5.l, v1.h ; encoding: [0x81,0xb3,0x0a,0x7e]
+
+v_frexp_mant_f16 v5.l, v127.h
+// GFX12: v_frexp_mant_f16_e32 v5.l, v127.h ; encoding: [0xff,0xb3,0x0a,0x7e]
+
+v_frexp_mant_f16 v5.h, src_scc
+// GFX12: v_frexp_mant_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7f]
+
+v_frexp_mant_f16 v127.h, 0xfe0b
+// GFX12: v_frexp_mant_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_frexp_mant_f32 v5, v1
 // GFX12: v_frexp_mant_f32_e32 v5, v1 ; encoding: [0x01,0x81,0x0a,0x7e]
@@ -2759,51 +2807,63 @@ v_movrelsd_b32 v255, v255
 v_nop
 // GFX12: v_nop ; encoding: [0x00,0x00,0x00,0x7e]
 
-v_not_b16 v5, v1
-// GFX12: v_not_b16_e32 v5, v1 ; encoding: [0x01,0xd3,0x0a,0x7e]
+v_not_b16 v5.l, v1.l
+// GFX12: v_not_b16_e32 v5.l, v1.l ; encoding: [0x01,0xd3,0x0a,0x7e]
+
+v_not_b16 v5.l, v127.l
+// GFX12: v_not_b16_e32 v5.l, v127.l ; encoding: [0x7f,0xd3,0x0a,0x7e]
 
-v_not_b16 v5, v127
-// GFX12: v_not_b16_e32 v5, v127 ; encoding: [0x7f,0xd3,0x0a,0x7e]
+v_not_b16 v5.l, s1
+// GFX12: v_not_b16_e32 v5.l, s1 ; encoding: [0x01,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, s1
-// GFX12: v_not_b16_e32 v5, s1 ; encoding: [0x01,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, s105
+// GFX12: v_not_b16_e32 v5.l, s105 ; encoding: [0x69,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, s105
-// GFX12: v_not_b16_e32 v5, s105 ; encoding: [0x69,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, vcc_lo
+// GFX12: v_not_b16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, vcc_lo
-// GFX12: v_not_b16_e32 v5, vcc_lo ; encoding: [0x6a,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, vcc_hi
+// GFX12: v_not_b16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, vcc_hi
-// GFX12: v_not_b16_e32 v5, vcc_hi ; encoding: [0x6b,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, ttmp15
+// GFX12: v_not_b16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, ttmp15
-// GFX12: v_not_b16_e32 v5, ttmp15 ; encoding: [0x7b,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, m0
+// GFX12: v_not_b16_e32 v5.l, m0 ; encoding: [0x7d,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, m0
-// GFX12: v_not_b16_e32 v5, m0 ; encoding: [0x7d,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, exec_lo
+// GFX12: v_not_b16_e32 v5.l, exec_lo ; encoding: [0x7e,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, exec_lo
-// GFX12: v_not_b16_e32 v5, exec_lo ; encoding: [0x7e,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, exec_hi
+// GFX12: v_not_b16_e32 v5.l, exec_hi ; encoding: [0x7f,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, exec_hi
-// GFX12: v_not_b16_e32 v5, exec_hi ; encoding: [0x7f,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, null
+// GFX12: v_not_b16_e32 v5.l, null ; encoding: [0x7c,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, null
-// GFX12: v_not_b16_e32 v5, null ; encoding: [0x7c,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, -1
+// GFX12: v_not_b16_e32 v5.l, -1 ; encoding: [0xc1,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, -1
-// GFX12: v_not_b16_e32 v5, -1 ; encoding: [0xc1,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, 0.5
+// GFX12-ASM: v_not_b16_e32 v5.l, 0.5 ; encoding: [0xf0,0xd2,0x0a,0x7e]
+// GFX12-DIS: v_not_b16_e32 v5.l, 0x3800 ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00]
 
-v_not_b16 v5, 0.5
-// GFX12-ASM: v_not_b16_e32 v5, 0.5 ; encoding: [0xf0,0xd2,0x0a,0x7e]
-// GFX12-DIS: v_not_b16_e32 v5, 0x3800 ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00]
+v_not_b16 v5.l, src_scc
+// GFX12: v_not_b16_e32 v5.l, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, src_scc
-// GFX12: v_not_b16_e32 v5, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e]
+v_not_b16 v127.l, 0xfe0b
+// GFX12: v_not_b16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_not_b16 v127, 0xfe0b
-// GFX12: v_not_b16_e32 v127, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_not_b16 v5.l, v1.h
+// GFX12: v_not_b16_e32 v5.l, v1.h ; encoding: [0x81,0xd3,0x0a,0x7e]
+
+v_not_b16 v5.l, v127.h
+// GFX12: v_not_b16_e32 v5.l, v127.h ; encoding: [0xff,0xd3,0x0a,0x7e]
+
+v_not_b16 v5.h, src_scc
+// GFX12: v_not_b16_e32 v5.h, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7f]
+
+v_not_b16 v127.h, 0xfe0b
+// GFX12: v_not_b16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_not_b32 v5, v1
 // GFX12: v_not_b32_e32 v5, v1 ; encoding: [0x01,0x6f,0x0a,0x7e]
@@ -3048,50 +3108,62 @@ v_readfirstlane_b32 ttmp15, v1
 v_readfirstlane_b32 null, v255
 // GFX12: v_readfirstlane_b32 null, v255 ; encoding: [0xff,0x05,0xf8,0x7e]
 
-v_rndne_f16 v5, v1
-// GFX12: v_rndne_f16_e32 v5, v1 ; encoding: [0x01,0xbd,0x0a,0x7e]
+v_rndne_f16 v5.l, v1.l
+// GFX12: v_rndne_f16_e32 v5.l, v1.l ; encoding: [0x01,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v5.l, v127.l
+// GFX12: v_rndne_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v5.l, s1
+// GFX12: v_rndne_f16_e32 v5.l, s1 ; encoding: [0x01,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, v127
-// GFX12: v_rndne_f16_e32 v5, v127 ; encoding: [0x7f,0xbd,0x0a,0x7e]
+v_rndne_f16 v5.l, s105
+// GFX12: v_rndne_f16_e32 v5.l, s105 ; encoding: [0x69,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, s1
-// GFX12: v_rndne_f16_e32 v5, s1 ; encoding: [0x01,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, vcc_lo
+// GFX12: v_rndne_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, s105
-// GFX12: v_rndne_f16_e32 v5, s105 ; encoding: [0x69,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, vcc_hi
+// GFX12: v_rndne_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, vcc_lo
-// GFX12: v_rndne_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, ttmp15
+// GFX12: v_rndne_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, vcc_hi
-// GFX12: v_rndne_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, m0
+// GFX12: v_rndne_f16_e32 v5.l, m0 ; encoding: [0x7d,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, ttmp15
-// GFX12: v_rndne_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, exec_lo
+// GFX12: v_rndne_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, m0
-// GFX12: v_rndne_f16_e32 v5, m0 ; encoding: [0x7d,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, exec_hi
+// GFX12: v_rndne_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, exec_lo
-// GFX12: v_rndne_f16_e32 v5, exec_lo ; encoding: [0x7e,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, null
+// GFX12: v_rndne_f16_e32 v5.l, null ; encoding: [0x7c,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, exec_hi
-// GFX12: v_rndne_f16_e32 v5, exec_hi ; encoding: [0x7f,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, -1
+// GFX12: v_rndne_f16_e32 v5.l, -1 ; encoding: [0xc1,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, null
-// GFX12: v_rndne_f16_e32 v5, null ; encoding: [0x7c,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, 0.5
+// GFX12: v_rndne_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, -1
-// GFX12: v_rndne_f16_e32 v5, -1 ; encoding: [0xc1,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, src_scc
+// GFX12: v_rndne_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, 0.5
-// GFX12: v_rndne_f16_e32 v5, 0.5 ; encoding: [0xf0,0xbc,0x0a,0x7e]
+v_rndne_f16 v127.l, 0xfe0b
+// GFX12: v_rndne_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_rndne_f16 v5, src_scc
-// GFX12: v_rndne_f16_e32 v5, src_scc ; encoding: [0xfd,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, v1.h
+// GFX12: v_rndne_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbd,0x0a,0x7e]
 
-v_rndne_f16 v127, 0xfe0b
-// GFX12: v_rndne_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_rndne_f16 v5.l, v127.h
+// GFX12: v_rndne_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v5.h, src_scc
+// GFX12: v_rndne_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xbc,0x0a,0x7f]
+
+v_rndne_f16 v127.h, 0xfe0b
+// GFX12: v_rndne_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xbc,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_rndne_f32 v5, v1
 // GFX12: v_rndne_f32_e32 v5, v1 ; encoding: [0x01,0x47,0x0a,0x7e]
@@ -3301,94 +3373,127 @@ v_rsq_f64 v[254:255], 0xaf123456
 // GFX12: v_rsq_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x62,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
 v_sat_pk_u8_i16 v5, v1
-// GFX12: v_sat_pk_u8_i16_e32 v5, v1 ; encoding: [0x01,0xc5,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, v1 ; encoding: [0x01,0xc5,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, v1 ; encoding: [0x01,0xc5,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, v255
-// GFX12: v_sat_pk_u8_i16_e32 v5, v255 ; encoding: [0xff,0xc5,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, v255 ; encoding: [0xff,0xc5,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, v255 ; encoding: [0xff,0xc5,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, s1
-// GFX12: v_sat_pk_u8_i16_e32 v5, s1 ; encoding: [0x01,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, s1 ; encoding: [0x01,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, s1 ; encoding: [0x01,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, s105
-// GFX12: v_sat_pk_u8_i16_e32 v5, s105 ; encoding: [0x69,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, s105 ; encoding: [0x69,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, s105 ; encoding: [0x69,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, vcc_lo
-// GFX12: v_sat_pk_u8_i16_e32 v5, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, vcc_hi
-// GFX12: v_sat_pk_u8_i16_e32 v5, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, ttmp15
-// GFX12: v_sat_pk_u8_i16_e32 v5, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, m0
-// GFX12: v_sat_pk_u8_i16_e32 v5, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, exec_lo
-// GFX12: v_sat_pk_u8_i16_e32 v5, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, exec_hi
-// GFX12: v_sat_pk_u8_i16_e32 v5, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, null
-// GFX12: v_sat_pk_u8_i16_e32 v5, null ; encoding: [0x7c,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, null ; encoding: [0x7c,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, null ; encoding: [0x7c,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, -1
-// GFX12: v_sat_pk_u8_i16_e32 v5, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, 0.5
-// GFX12: v_sat_pk_u8_i16_e32 v5, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, src_scc
-// GFX12: v_sat_pk_u8_i16_e32 v5, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v127, 0xfe0b
-// GFX12: v_sat_pk_u8_i16_e32 v127, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v127, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_sat_pk_u8_i16 v5.h, src_scc
+// GFX12: v_sat_pk_u8_i16_e32 v5.h, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7f]
+
+v_sat_pk_u8_i16 v127.h, 0xfe0b
+// GFX12: v_sat_pk_u8_i16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
-v_sin_f16 v5, v1
-// GFX12: v_sin_f16_e32 v5, v1 ; encoding: [0x01,0xc1,0x0a,0x7e]
+v_sin_f16 v5.l, v1.l
+// GFX12: v_sin_f16_e32 v5.l, v1.l ; encoding: [0x01,0xc1,0x0a,0x7e]
 
-v_sin_f16 v5, v127
-// GFX12: v_sin_f16_e32 v5, v127 ; encoding: [0x7f,0xc1,0x0a,0x7e]
+v_sin_f16 v5.l, v127.l
+// GFX12: v_sin_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xc1,0x0a,0x7e]
 
-v_sin_f16 v5, s1
-// GFX12: v_sin_f16_e32 v5, s1 ; encoding: [0x01,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, s1
+// GFX12: v_sin_f16_e32 v5.l, s1 ; encoding: [0x01,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, s105
-// GFX12: v_sin_f16_e32 v5, s105 ; encoding: [0x69,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, s105
+// GFX12: v_sin_f16_e32 v5.l, s105 ; encoding: [0x69,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, vcc_lo
-// GFX12: v_sin_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, vcc_lo
+// GFX12: v_sin_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, vcc_hi
-// GFX12: v_sin_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, vcc_hi
+// GFX12: v_sin_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, ttmp15
-// GFX12: v_sin_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, ttmp15
+// GFX12: v_sin_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, m0
-// GFX12: v_sin_f16_e32 v5, m0 ; encoding: [0x7d,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, m0
+// GFX12: v_sin_f16_e32 v5.l, m0 ; encoding: [0x7d,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, exec_lo
-// GFX12: v_sin_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, exec_lo
+// GFX12: v_sin_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, exec_hi
-// GFX12: v_sin_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, exec_hi
+// GFX12: v_sin_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, null
-// GFX12: v_sin_f16_e32 v5, null ; encoding: [0x7c,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, null
+// GFX12: v_sin_f16_e32 v5.l, null ; encoding: [0x7c,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, -1
-// GFX12: v_sin_f16_e32 v5, -1 ; encoding: [0xc1,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, -1
+// GFX12: v_sin_f16_e32 v5.l, -1 ; encoding: [0xc1,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, 0.5
-// GFX12: v_sin_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, 0.5
+// GFX12: v_sin_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, src_scc
-// GFX12: v_sin_f16_e32 v5, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, src_scc
+// GFX12: v_sin_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7e]
 
-v_sin_f16 v127, 0xfe0b
-// GFX12: v_sin_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_sin_f16 v127.l, 0xfe0b
+// GFX12: v_sin_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_sin_f16 v5.l, v1.h
+// GFX12: v_sin_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc1,0x0a,0x7e]
+
+v_sin_f16 v5.l, v127.h
+// GFX12: v_sin_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc1,0x0a,0x7e]
+
+v_sin_f16 v5.h, src_scc
+// GFX12: v_sin_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7f]
+
+v_sin_f16 v127.h, 0xfe0b
+// GFX12: v_sin_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_sin_f32 v5, v1
 // GFX12: v_sin_f32_e32 v5, v1 ; encoding: [0x01,0x6b,0x0a,0x7e]
@@ -3582,50 +3687,62 @@ v_swaprel_b32 v5, v1
 v_swaprel_b32 v255, v255
 // GFX12: v_swaprel_b32 v255, v255 ; encoding: [0xff,0xd1,0xfe,0x7f]
 
-v_trunc_f16 v5, v1
-// GFX12: v_trunc_f16_e32 v5, v1 ; encoding: [0x01,0xbb,0x0a,0x7e]
+v_trunc_f16 v5.l, v1.l
+// GFX12: v_trunc_f16_e32 v5.l, v1.l ; encoding: [0x01,0xbb,0x0a,0x7e]
+
+v_trunc_f16 v5.l, v127.l
+// GFX12: v_trunc_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbb,0x0a,0x7e]
+
+v_trunc_f16 v5.l, s1
+// GFX12: v_trunc_f16_e32 v5.l, s1 ; encoding: [0x01,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5.l, s105
+// GFX12: v_trunc_f16_e32 v5.l, s105 ; encoding: [0x69,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5.l, vcc_lo
+// GFX12: v_trunc_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, v127
-// GFX12: v_trunc_f16_e32 v5, v127 ; encoding: [0x7f,0xbb,0x0a,0x7e]
+v_trunc_f16 v5.l, vcc_hi
+// GFX12: v_trunc_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, s1
-// GFX12: v_trunc_f16_e32 v5, s1 ; encoding: [0x01,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, ttmp15
+// GFX12: v_trunc_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, s105
-// GFX12: v_trunc_f16_e32 v5, s105 ; encoding: [0x69,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, m0
+// GFX12: v_trunc_f16_e32 v5.l, m0 ; encoding: [0x7d,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, vcc_lo
-// GFX12: v_trunc_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, exec_lo
+// GFX12: v_trunc_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, vcc_hi
-// GFX12: v_trunc_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, exec_hi
+// GFX12: v_trunc_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, ttmp15
-// GFX12: v_trunc_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, null
+// GFX12: v_trunc_f16_e32 v5.l, null ; encoding: [0x7c,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, m0
-// GFX12: v_trunc_f16_e32 v5, m0 ; encoding: [0x7d,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, -1
+// GFX12: v_trunc_f16_e32 v5.l, -1 ; encoding: [0xc1,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, exec_lo
-// GFX12: v_trunc_f16_e32 v5, exec_lo ; encoding: [0x7e,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, 0.5
+// GFX12: v_trunc_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, exec_hi
-// GFX12: v_trunc_f16_e32 v5, exec_hi ; encoding: [0x7f,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, src_scc
+// GFX12: v_trunc_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, null
-// GFX12: v_trunc_f16_e32 v5, null ; encoding: [0x7c,0xba,0x0a,0x7e]
+v_trunc_f16 v127.l, 0xfe0b
+// GFX12: v_trunc_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_trunc_f16 v5, -1
-// GFX12: v_trunc_f16_e32 v5, -1 ; encoding: [0xc1,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, v1.h
+// GFX12: v_trunc_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbb,0x0a,0x7e]
 
-v_trunc_f16 v5, 0.5
-// GFX12: v_trunc_f16_e32 v5, 0.5 ; encoding: [0xf0,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, v127.h
+// GFX12: v_trunc_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbb,0x0a,0x7e]
 
-v_trunc_f16 v5, src_scc
-// GFX12: v_trunc_f16_e32 v5, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e]
+v_trunc_f16 v5.h, src_scc
+// GFX12: v_trunc_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xba,0x0a,0x7f]
 
-v_trunc_f16 v127, 0xfe0b
-// GFX12: v_trunc_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_trunc_f16 v127.h, 0xfe0b
+// GFX12: v_trunc_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_trunc_f32 v5, v1
 // GFX12: v_trunc_f32_e32 v5, v1 ; encoding: [0x01,0x43,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
index 56b42f1..e821fb3 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
@@ -214,47 +214,53 @@ v_clz_i32_u32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_clz_i32_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_clz_i32_u32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x72,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
-v_cos_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_cos_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_cos_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
-v_cos_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_cos_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_cos_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_cos_f16 v5, v1 row_mirror
-// GFX12: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_mirror
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_cos_f16 v5, v1 row_half_mirror
-// GFX12: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_half_mirror
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_cos_f16 v5, v1 row_shl:1
-// GFX12: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_shl:1
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_cos_f16 v5, v1 row_shl:15
-// GFX12: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_shl:15
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_cos_f16 v5, v1 row_shr:1
-// GFX12: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_shr:1
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_cos_f16 v5, v1 row_shr:15
-// GFX12: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_shr:15
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_cos_f16 v5, v1 row_ror:1
-// GFX12: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_ror:1
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_cos_f16 v5, v1 row_ror:15
-// GFX12: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_ror:15
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_cos_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_cos_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_cos_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_cos_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_cos_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_cos_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_cos_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cos_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_cos_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cos_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x09,0x13]
+
+v_cos_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cos_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_cos_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_cos_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -970,47 +976,53 @@ v_cvt_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cvt_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_cvt_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x10,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_cvt_i32_i16 v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_cvt_i32_i16 v5, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_i32_i16 v5, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_cvt_i32_i16 v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_cvt_i32_i16 v5, v1.l row_mirror
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_mirror
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_half_mirror
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_half_mirror
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_shl:1
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_shl:1
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_shl:15
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_shl:15
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_shr:1
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_shr:1
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_shr:15
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_shr:15
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_ror:1
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_ror:1
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_ror:15
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_ror:15
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_cvt_i32_i16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_cvt_i32_i16 v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_cvt_i32_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_cvt_i32_i16 v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cvt_i32_i16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x05,0x30]
 
-v_cvt_i32_i16 v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x05,0x30]
+v_cvt_i32_i16 v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cvt_i32_i16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x09,0x13]
+
+v_cvt_i32_i16 v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cvt_i32_i16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
 v_cvt_nearest_i32_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_cvt_nearest_i32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1324,47 +1336,53 @@ v_cvt_u32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cvt_u32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_cvt_u32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x0e,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_cvt_u32_u16 v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_cvt_u32_u16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_cvt_u32_u16 v5, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_u32_u16 v5, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_cvt_u32_u16 v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_cvt_u32_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_cvt_u32_u16 v5, v1.l row_mirror
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_mirror
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_half_mirror
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_half_mirror
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_shl:1
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_shl:1
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_shl:15
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_shl:15
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_shr:1
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_shr:1
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_shr:15
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_shr:15
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_ror:1
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_ror:1
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_ror:15
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_ror:15
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_cvt_u32_u16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_cvt_u32_u16 v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_cvt_u32_u16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_cvt_u32_u16 v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cvt_u32_u16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x05,0x30]
 
-v_cvt_u32_u16 v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_cvt_u32_u16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x05,0x30]
+v_cvt_u32_u16 v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cvt_u32_u16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x81,0x60,0x09,0x13]
+
+v_cvt_u32_u16 v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cvt_u32_u16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
 v_exp_f16 v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_exp_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1660,47 +1678,53 @@ v_floor_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_floor_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_floor_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x48,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_fract_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_fract_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_fract_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_fract_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_fract_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_fract_f16 v5.l, v1.l row_mirror
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_fract_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_fract_f16 v5.l, v1.l row_half_mirror
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_fract_f16 v5, v1 row_mirror
-// GFX12: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_shl:1
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_fract_f16 v5, v1 row_half_mirror
-// GFX12: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_shl:15
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_fract_f16 v5, v1 row_shl:1
-// GFX12: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_shr:1
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_fract_f16 v5, v1 row_shl:15
-// GFX12: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_shr:15
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_fract_f16 v5, v1 row_shr:1
-// GFX12: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_ror:1
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_fract_f16 v5, v1 row_shr:15
-// GFX12: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_ror:15
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_fract_f16 v5, v1 row_ror:1
-// GFX12: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_fract_f16 v5, v1 row_ror:15
-// GFX12: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_fract_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_fract_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_fract_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_fract_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 
-v_fract_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_fract_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_fract_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x09,0x13]
 
-v_fract_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_fract_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_fract_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_fract_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_fract_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1834,47 +1858,53 @@ v_frexp_exp_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 f
 v_frexp_exp_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_frexp_exp_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x7e,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_frexp_mant_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_frexp_mant_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_frexp_mant_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_frexp_mant_f16 v5.l, v1.l row_mirror
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_half_mirror
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_mirror
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_shl:1
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_half_mirror
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_shl:15
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_shl:1
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_shr:1
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_shl:15
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_shr:15
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_shr:1
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_ror:1
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_shr:15
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_ror:15
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_ror:1
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_ror:15
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_frexp_mant_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_frexp_mant_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_frexp_mant_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_frexp_mant_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 
-v_frexp_mant_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_frexp_mant_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_frexp_mant_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x09,0x13]
 
-v_frexp_mant_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_frexp_mant_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_frexp_mant_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_frexp_mant_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_frexp_mant_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2212,47 +2242,53 @@ v_movrelsd_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_movrelsd_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_movrelsd_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x88,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
-v_not_b16 v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_not_b16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_not_b16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
-v_not_b16 v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_not_b16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_not_b16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_not_b16 v5, v1 row_mirror
-// GFX12: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_not_b16 v5.l, v1.l row_mirror
+// GFX12: v_not_b16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_not_b16 v5, v1 row_half_mirror
-// GFX12: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_not_b16 v5.l, v1.l row_half_mirror
+// GFX12: v_not_b16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_not_b16 v5, v1 row_shl:1
-// GFX12: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_not_b16 v5.l, v1.l row_shl:1
+// GFX12: v_not_b16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_not_b16 v5, v1 row_shl:15
-// GFX12: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_not_b16 v5.l, v1.l row_shl:15
+// GFX12: v_not_b16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_not_b16 v5, v1 row_shr:1
-// GFX12: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_not_b16 v5.l, v1.l row_shr:1
+// GFX12: v_not_b16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_not_b16 v5, v1 row_shr:15
-// GFX12: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_not_b16 v5.l, v1.l row_shr:15
+// GFX12: v_not_b16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_not_b16 v5, v1 row_ror:1
-// GFX12: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_not_b16 v5.l, v1.l row_ror:1
+// GFX12: v_not_b16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_not_b16 v5, v1 row_ror:15
-// GFX12: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_not_b16 v5.l, v1.l row_ror:15
+// GFX12: v_not_b16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_not_b16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_not_b16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_not_b16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_not_b16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_not_b16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_not_b16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_not_b16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_not_b16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_not_b16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_not_b16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x05,0x30]
+v_not_b16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_not_b16_dpp v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x05,0x30]
+
+v_not_b16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_not_b16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x09,0x13]
+
+v_not_b16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_not_b16_dpp v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
 v_not_b32 v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_not_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2422,47 +2458,53 @@ v_rcp_iflag_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_rcp_iflag_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_rcp_iflag_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x56,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_rndne_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_rndne_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_rndne_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_rndne_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_rndne_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_rndne_f16 v5.l, v1.l row_mirror
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_mirror
-// GFX12: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_half_mirror
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_half_mirror
-// GFX12: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shl:1
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shl:1
-// GFX12: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shl:15
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shl:15
-// GFX12: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shr:1
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shr:1
-// GFX12: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shr:15
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shr:15
-// GFX12: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_ror:1
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_ror:1
-// GFX12: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_ror:15
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_ror:15
-// GFX12: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_rndne_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_rndne_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_rndne_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_rndne_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_rndne_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 
-v_rndne_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_rndne_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_rndne_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x09,0x13]
+
+v_rndne_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_rndne_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_rndne_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_rndne_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2632,47 +2674,59 @@ v_sat_pk_u8_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_sat_pk_u8_i16 v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x05,0x30]
 
-v_sin_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_sat_pk_u8_i16 v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_sat_pk_u8_i16_dpp v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x09,0x13]
+
+v_sat_pk_u8_i16 v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_sat_pk_u8_i16_dpp v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
-v_sin_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_sin_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_sin_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
-v_sin_f16 v5, v1 row_mirror
-// GFX12: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_sin_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_sin_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_sin_f16 v5, v1 row_half_mirror
-// GFX12: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_mirror
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_sin_f16 v5, v1 row_shl:1
-// GFX12: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_half_mirror
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_sin_f16 v5, v1 row_shl:15
-// GFX12: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_shl:1
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_sin_f16 v5, v1 row_shr:1
-// GFX12: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_shl:15
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_sin_f16 v5, v1 row_shr:15
-// GFX12: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_shr:1
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_sin_f16 v5, v1 row_ror:1
-// GFX12: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_shr:15
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_sin_f16 v5, v1 row_ror:15
-// GFX12: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_ror:1
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_sin_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_ror:15
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_sin_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_sin_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_sin_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_sin_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_sin_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_sin_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_sin_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_sin_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_sin_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_sin_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x09,0x13]
+
+v_sin_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_sin_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_sin_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_sin_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2800,47 +2854,53 @@ v_sqrt_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_sqrt_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_sqrt_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x66,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_trunc_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_trunc_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_trunc_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_trunc_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_trunc_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_trunc_f16 v5.l, v1.l row_mirror
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_trunc_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_trunc_f16 v5.l, v1.l row_half_mirror
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_mirror
-// GFX12: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_shl:1
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_half_mirror
-// GFX12: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_shl:15
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_shl:1
-// GFX12: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_shr:1
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_shl:15
-// GFX12: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_shr:15
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_shr:1
-// GFX12: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_ror:1
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_shr:15
-// GFX12: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_ror:15
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_ror:1
-// GFX12: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_ror:15
-// GFX12: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_trunc_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_trunc_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_trunc_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_trunc_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 
-v_trunc_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_trunc_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_trunc_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7f,0x81,0x60,0x09,0x13]
 
-v_trunc_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_trunc_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_trunc_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_trunc_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_trunc_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
index 09f3069..ecf408e 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
@@ -49,14 +49,20 @@ v_clz_i32_u32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_clz_i32_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_clz_i32_u32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x72,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_cos_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cos_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_cos_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cos_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_cos_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_cos_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cos_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_cos_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cos_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+
+v_cos_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cos_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_cos_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_cos_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -244,14 +250,20 @@ v_cvt_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cvt_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_cvt_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x10,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_cvt_i32_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cvt_i32_i16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_cvt_i32_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cvt_i32_i16 v255, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cvt_i32_i16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
 
-v_cvt_i32_i16 v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+v_cvt_i32_i16 v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cvt_i32_i16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cvt_i32_i16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_cvt_nearest_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_cvt_nearest_i32_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -334,14 +346,20 @@ v_cvt_u32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cvt_u32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_cvt_u32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x0e,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_cvt_u32_u16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cvt_u32_u16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cvt_u32_u16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_u32_u16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_cvt_u32_u16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cvt_u32_u16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cvt_u32_u16 v255, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cvt_u32_u16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
 
-v_cvt_u32_u16 v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_cvt_u32_u16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+v_cvt_u32_u16 v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cvt_u32_u16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd6,0x0a,0x7e,0x81,0x77,0x39,0x05]
+
+v_cvt_u32_u16 v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cvt_u32_u16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd6,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_exp_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_exp_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -406,14 +424,20 @@ v_floor_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_floor_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_floor_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x48,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_fract_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_fract_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_fract_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_fract_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_fract_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
 
-v_fract_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_fract_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_fract_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05]
 
-v_fract_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_fract_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_fract_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_fract_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_fract_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -448,14 +472,20 @@ v_frexp_exp_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_frexp_exp_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_frexp_exp_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x7e,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_frexp_mant_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05]
 
-v_frexp_mant_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_frexp_mant_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_frexp_mant_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_frexp_mant_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_frexp_mant_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -529,14 +559,20 @@ v_movrelsd_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_movrelsd_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_movrelsd_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x88,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_not_b16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_not_b16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_not_b16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_not_b16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_not_b16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_not_b16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_not_b16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_not_b16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_not_b16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+
+v_not_b16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_not_b16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_not_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_not_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -574,14 +610,20 @@ v_rcp_iflag_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_rcp_iflag_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_rcp_iflag_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x56,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_rndne_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rndne_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_rndne_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_rndne_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_rndne_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
 
-v_rndne_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_rndne_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_rndne_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05]
+
+v_rndne_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_rndne_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_rndne_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_rndne_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -619,14 +661,26 @@ v_sat_pk_u8_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_sat_pk_u8_i16 v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00]
 
-v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_sat_pk_u8_i16 v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_sat_pk_u8_i16_dpp v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05]
+
+v_sat_pk_u8_i16 v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_sin_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_sin_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_sin_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sin_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_sin_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_sin_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_sin_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05]
+
+v_sin_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_sin_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_sin_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_sin_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -655,14 +709,20 @@ v_sqrt_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_sqrt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_sqrt_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x66,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_trunc_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_trunc_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_trunc_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_trunc_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_trunc_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
 
-v_trunc_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_trunc_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_trunc_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05]
 
-v_trunc_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_trunc_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_trunc_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_trunc_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_trunc_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s
index 0ccad9c..ad08a5c 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 ; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 --implicit-check-not=error %s
 
@@ -25,6 +26,12 @@ v_ceil_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 v_cos_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_cos_f16_e32 v128.h, 0xfe0b
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v128.l, 0xfe0b
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_cos_f16_e32 v255, v1
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -34,6 +41,24 @@ v_cos_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_cos_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction
 
+v_cos_f16_e32 v255.h, v1.h
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v255.l, v1.l
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_cos_f16_e32 v5, v199
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -43,6 +68,24 @@ v_cos_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_cos_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction
 
+v_cos_f16_e32 v5.h, v199.h
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_cos_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_cos_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_cos_f16_e32 v5.l, v199.l
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_cos_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_cos_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
 v_cvt_f16_f32_e32 v128.h, 0xaf123456
 // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction
 
@@ -250,6 +293,24 @@ v_cvt_i32_i16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_cvt_i32_i16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
 
+v_cvt_i32_i16_e32 v5, v199.h
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5, v199.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5, v199.l
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5, v199.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
 v_cvt_norm_i16_f16_e32 v128.h, 0xfe0b
 // GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction
 
@@ -385,6 +446,24 @@ v_cvt_u32_u16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_cvt_u32_u16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
 
+v_cvt_u32_u16_e32 v5, v199.h
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5, v199.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5, v199.l
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5, v199.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
 v_exp_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -430,6 +509,12 @@ v_floor_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 v_fract_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_fract_f16_e32 v128.h, 0xfe0b
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v128.l, 0xfe0b
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_fract_f16_e32 v255, v1
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -439,6 +524,24 @@ v_fract_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_fract_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_fract_f16_e32 v255.h, v1.h
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v255.l, v1.l
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_fract_f16_e32 v5, v199
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -448,6 +551,24 @@ v_fract_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_fract_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_fract_f16_e32 v5.h, v199.h
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fract_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fract_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fract_f16_e32 v5.l, v199.l
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fract_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fract_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
 v_frexp_exp_i16_f16_e32 v128.h, 0xfe0b
 // GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction
 
@@ -493,6 +614,12 @@ v_frexp_exp_i16_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
 v_frexp_mant_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_frexp_mant_f16_e32 v128.h, 0xfe0b
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v128.l, 0xfe0b
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
+
 v_frexp_mant_f16_e32 v255, v1
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -502,6 +629,24 @@ v_frexp_mant_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_frexp_mant_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:31: error: invalid operand for instruction
 
+v_frexp_mant_f16_e32 v255.h, v1.h
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255.l, v1.l
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
+
 v_frexp_mant_f16_e32 v5, v199
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -511,6 +656,24 @@ v_frexp_mant_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_frexp_mant_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:31: error: invalid operand for instruction
 
+v_frexp_mant_f16_e32 v5.h, v199.h
+// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5.l, v199.l
+// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+
 v_log_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -535,6 +698,12 @@ v_log_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 v_not_b16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
 
+v_not_b16_e32 v128.h, 0xfe0b
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v128.l, 0xfe0b
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_not_b16_e32 v255, v1
 // GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
 
@@ -544,6 +713,24 @@ v_not_b16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_not_b16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
 
+v_not_b16_e32 v255.h, v1.h
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v255.l, v1.l
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_not_b16_e32 v5, v199
 // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction
 
@@ -553,6 +740,24 @@ v_not_b16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_not_b16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction
 
+v_not_b16_e32 v5.h, v199.h
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_not_b16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_not_b16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_not_b16_e32 v5.l, v199.l
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_not_b16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_not_b16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
 v_rcp_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -577,6 +782,12 @@ v_rcp_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 v_rndne_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_rndne_f16_e32 v128.h, 0xfe0b
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v128.l, 0xfe0b
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_rndne_f16_e32 v255, v1
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -586,6 +797,24 @@ v_rndne_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_rndne_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_rndne_f16_e32 v255.h, v1.h
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_rndne_f16_e32 v5, v199
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -595,6 +824,24 @@ v_rndne_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_rndne_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_rndne_f16_e32 v5.h, v199.h
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
 v_rsq_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -625,9 +872,33 @@ v_sat_pk_u8_i16_e32 v199, v5 dpp8:[7,6,5,4,3,2,1,0]
 v_sat_pk_u8_i16_e32 v199, v5 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:30: error: invalid operand for instruction
 
+v_sat_pk_u8_i16_e32 v199.h, v5
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199.h, v5 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199.h, v5 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199.l, v5
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199.l, v5 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199.l, v5 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
 v_sin_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_sin_f16_e32 v128.h, 0xfe0b
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v128.l, 0xfe0b
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_sin_f16_e32 v255, v1
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -637,6 +908,24 @@ v_sin_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_sin_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction
 
+v_sin_f16_e32 v255.h, v1.h
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v255.l, v1.l
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_sin_f16_e32 v5, v199
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -646,6 +935,24 @@ v_sin_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_sin_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction
 
+v_sin_f16_e32 v5.h, v199.h
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sin_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sin_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sin_f16_e32 v5.l, v199.l
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sin_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sin_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
 v_sqrt_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -685,6 +992,12 @@ v_swap_b16_e32 v128.l, v0.l
 v_trunc_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_trunc_f16_e32 v128.h, 0xfe0b
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v128.l, 0xfe0b
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_trunc_f16_e32 v255, v1
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -694,6 +1007,24 @@ v_trunc_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_trunc_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_trunc_f16_e32 v255.h, v1.h
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255.l, v1.l
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_trunc_f16_e32 v5, v199
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -702,3 +1033,21 @@ v_trunc_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 
 v_trunc_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.h, v199.h
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.l, v199.l
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s
index f220ec2..cc5870f 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX12 --implicit-check-not=_e32 %s
 
 v_ceil_f16 v128, 0xfe0b
@@ -67,71 +67,137 @@ v_ceil_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_ceil_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: v_ceil_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_cos_f16 v128, 0xfe0b
-// GFX12: v_cos_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_cos_f16 v128.h, 0xfe0b
+// GFX12: v_cos_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_cos_f16 v255, -1
-// GFX12: v_cos_f16_e64 v255, -1                  ; encoding: [0xff,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
+v_cos_f16 v128.l, 0xfe0b
+// GFX12: v_cos_f16_e64 v128.l, 0xfe0b            ; encoding: [0x80,0x00,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_cos_f16 v255, 0.5
-// GFX12: v_cos_f16_e64 v255, 0.5                 ; encoding: [0xff,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x00]
+v_cos_f16 v255.h, -1
+// GFX12: v_cos_f16_e64 v255.h, -1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe1,0xd5,0xc1,0x00,0x00,0x00]
 
-v_cos_f16 v255, exec_hi
-// GFX12: v_cos_f16_e64 v255, exec_hi             ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+v_cos_f16 v255.h, 0.5
+// GFX12: v_cos_f16_e64 v255.h, 0.5 op_sel:[0,1]  ; encoding: [0xff,0x40,0xe1,0xd5,0xf0,0x00,0x00,0x00]
 
-v_cos_f16 v255, exec_lo
-// GFX12: v_cos_f16_e64 v255, exec_lo             ; encoding: [0xff,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+v_cos_f16 v255.h, exec_hi
+// GFX12: v_cos_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7f,0x00,0x00,0x00]
 
-v_cos_f16 v255, m0
-// GFX12: v_cos_f16_e64 v255, m0                  ; encoding: [0xff,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+v_cos_f16 v255.h, exec_lo
+// GFX12: v_cos_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7e,0x00,0x00,0x00]
 
-v_cos_f16 v255, null
-// GFX12: v_cos_f16_e64 v255, null                ; encoding: [0xff,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+v_cos_f16 v255.h, m0
+// GFX12: v_cos_f16_e64 v255.h, m0 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe1,0xd5,0x7d,0x00,0x00,0x00]
 
-v_cos_f16 v255, s1
-// GFX12: v_cos_f16_e64 v255, s1                  ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+v_cos_f16 v255.h, null
+// GFX12: v_cos_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7c,0x00,0x00,0x00]
 
-v_cos_f16 v255, s105
-// GFX12: v_cos_f16_e64 v255, s105                ; encoding: [0xff,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+v_cos_f16 v255.h, s1
+// GFX12: v_cos_f16_e64 v255.h, s1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe1,0xd5,0x01,0x00,0x00,0x00]
 
-v_cos_f16 v255, src_scc
-// GFX12: v_cos_f16_e64 v255, src_scc             ; encoding: [0xff,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x00]
+v_cos_f16 v255.h, s105
+// GFX12: v_cos_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x69,0x00,0x00,0x00]
 
-v_cos_f16 v255, ttmp15
-// GFX12: v_cos_f16_e64 v255, ttmp15              ; encoding: [0xff,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+v_cos_f16 v255.h, src_scc
+// GFX12: v_cos_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0xfd,0x00,0x00,0x00]
 
-v_cos_f16 v255, v1
-// GFX12: v_cos_f16_e64 v255, v1                  ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+v_cos_f16 v255.h, ttmp15
+// GFX12: v_cos_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7b,0x00,0x00,0x00]
 
-v_cos_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cos_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cos_f16 v255.h, v1.h
+// GFX12: v_cos_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00]
 
-v_cos_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX12: v_cos_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_cos_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_cos_f16 v255, v127
-// GFX12: v_cos_f16_e64 v255, v127                ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x01,0x00,0x00]
+v_cos_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_cos_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cos_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_cos_f16 v255.h, v127.h
+// GFX12: v_cos_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe1,0xd5,0x7f,0x01,0x00,0x00]
 
-v_cos_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX12: v_cos_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_cos_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_cos_f16 v255, vcc_hi
-// GFX12: v_cos_f16_e64 v255, vcc_hi              ; encoding: [0xff,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+v_cos_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_cos_f16 v255, vcc_lo
-// GFX12: v_cos_f16_e64 v255, vcc_lo              ; encoding: [0xff,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+v_cos_f16 v255.h, vcc_hi
+// GFX12: v_cos_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x6b,0x00,0x00,0x00]
 
-v_cos_f16 v5, v199
-// GFX12: v_cos_f16_e64 v5, v199                  ; encoding: [0x05,0x00,0xe1,0xd5,0xc7,0x01,0x00,0x00]
+v_cos_f16 v255.h, vcc_lo
+// GFX12: v_cos_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x6a,0x00,0x00,0x00]
 
-v_cos_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cos_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_cos_f16 v255.l, -1
+// GFX12: v_cos_f16_e64 v255.l, -1                ; encoding: [0xff,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
 
-v_cos_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX12: v_cos_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_cos_f16 v255.l, 0.5
+// GFX12: v_cos_f16_e64 v255.l, 0.5               ; encoding: [0xff,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, exec_hi
+// GFX12: v_cos_f16_e64 v255.l, exec_hi           ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, exec_lo
+// GFX12: v_cos_f16_e64 v255.l, exec_lo           ; encoding: [0xff,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, m0
+// GFX12: v_cos_f16_e64 v255.l, m0                ; encoding: [0xff,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, null
+// GFX12: v_cos_f16_e64 v255.l, null              ; encoding: [0xff,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, s1
+// GFX12: v_cos_f16_e64 v255.l, s1                ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, s105
+// GFX12: v_cos_f16_e64 v255.l, s105              ; encoding: [0xff,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, src_scc
+// GFX12: v_cos_f16_e64 v255.l, src_scc           ; encoding: [0xff,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, ttmp15
+// GFX12: v_cos_f16_e64 v255.l, ttmp15            ; encoding: [0xff,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, v1.l
+// GFX12: v_cos_f16_e64 v255.l, v1.l              ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+
+v_cos_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cos_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cos_f16 v255.l, v127.l
+// GFX12: v_cos_f16_e64 v255.l, v127.l            ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x01,0x00,0x00]
+
+v_cos_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_cos_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_cos_f16 v255.l, vcc_hi
+// GFX12: v_cos_f16_e64 v255.l, vcc_hi            ; encoding: [0xff,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, vcc_lo
+// GFX12: v_cos_f16_e64 v255.l, vcc_lo            ; encoding: [0xff,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+
+v_cos_f16 v5.h, v199.h
+// GFX12: v_cos_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe1,0xd5,0xc7,0x01,0x00,0x00]
+
+v_cos_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_cos_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_cos_f16 v5.l, v199.l
+// GFX12: v_cos_f16_e64 v5.l, v199.l              ; encoding: [0x05,0x00,0xe1,0xd5,0xc7,0x01,0x00,0x00]
+
+v_cos_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_cos_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_cvt_f16_f32 v128.h, 0xaf123456
 // GFX12: v_cvt_f16_f32_e64 v128.h, 0xaf123456 op_sel:[0,1] ; encoding: [0x80,0x40,0x8a,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
@@ -622,14 +688,23 @@ v_cvt_i16_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
 v_cvt_i16_f16 v5.l, v199.l quad_perm:[3,2,1,0]
 // GFX12: v_cvt_i16_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd3,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_cvt_i32_i16 v5, v199
-// GFX12: v_cvt_i32_i16_e64 v5, v199              ; encoding: [0x05,0x00,0xea,0xd5,0xc7,0x01,0x00,0x00]
+v_cvt_i32_i16 v5, v199.h
+// GFX12: v_cvt_i32_i16_e64 v5, v199.h op_sel:[1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xc7,0x01,0x00,0x00]
+
+v_cvt_i32_i16 v5, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v199.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v5, v199.h quad_perm:[3,2,1,0]
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v199.h op_sel:[1,0] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_cvt_i32_i16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_cvt_i32_i16 v5, v199.l
+// GFX12: v_cvt_i32_i16_e64 v5, v199.l            ; encoding: [0x05,0x00,0xea,0xd5,0xc7,0x01,0x00,0x00]
 
-v_cvt_i32_i16 v5, v199 quad_perm:[3,2,1,0]
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_cvt_i32_i16 v5, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v5, v199.l quad_perm:[3,2,1,0]
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_cvt_norm_i16_f16 v128.h, 0xfe0b
 // GFX12: v_cvt_norm_i16_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe3,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1027,14 +1102,23 @@ v_cvt_u16_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
 v_cvt_u16_f16 v5.l, v199.l quad_perm:[3,2,1,0]
 // GFX12: v_cvt_u16_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd2,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_cvt_u32_u16 v5, v199
-// GFX12: v_cvt_u32_u16_e64 v5, v199              ; encoding: [0x05,0x00,0xeb,0xd5,0xc7,0x01,0x00,0x00]
+v_cvt_u32_u16 v5, v199.h
+// GFX12: v_cvt_u32_u16_e64 v5, v199.h op_sel:[1,0] ; encoding: [0x05,0x08,0xeb,0xd5,0xc7,0x01,0x00,0x00]
+
+v_cvt_u32_u16 v5, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v199.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
 
-v_cvt_u32_u16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_cvt_u32_u16 v5, v199.h quad_perm:[3,2,1,0]
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v199.h op_sel:[1,0] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_cvt_u32_u16 v5, v199 quad_perm:[3,2,1,0]
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_cvt_u32_u16 v5, v199.l
+// GFX12: v_cvt_u32_u16_e64 v5, v199.l            ; encoding: [0x05,0x00,0xeb,0xd5,0xc7,0x01,0x00,0x00]
+
+v_cvt_u32_u16 v5, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_cvt_u32_u16 v5, v199.l quad_perm:[3,2,1,0]
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_exp_f16 v128, 0xfe0b
 // GFX12: v_exp_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xd8,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1168,71 +1252,137 @@ v_floor_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_floor_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: v_floor_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_fract_f16 v128, 0xfe0b
-// GFX12: v_fract_f16_e64 v128, 0xfe0b            ; encoding: [0x80,0x00,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_fract_f16 v128.h, 0xfe0b
+// GFX12: v_fract_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_fract_f16 v128.l, 0xfe0b
+// GFX12: v_fract_f16_e64 v128.l, 0xfe0b          ; encoding: [0x80,0x00,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_fract_f16 v255.h, -1
+// GFX12: v_fract_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xc1,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, 0.5
+// GFX12: v_fract_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xf0,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, exec_hi
+// GFX12: v_fract_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7f,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, exec_lo
+// GFX12: v_fract_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7e,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, m0
+// GFX12: v_fract_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7d,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, null
+// GFX12: v_fract_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7c,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, s1
+// GFX12: v_fract_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x01,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, s105
+// GFX12: v_fract_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x69,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, src_scc
+// GFX12: v_fract_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xfd,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, ttmp15
+// GFX12: v_fract_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7b,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, v1.h
+// GFX12: v_fract_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00]
+
+v_fract_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_fract_f16 v255, -1
-// GFX12: v_fract_f16_e64 v255, -1                ; encoding: [0xff,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
+v_fract_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_fract_f16 v255, 0.5
-// GFX12: v_fract_f16_e64 v255, 0.5               ; encoding: [0xff,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x00]
+v_fract_f16 v255.h, v127.h
+// GFX12: v_fract_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdf,0xd5,0x7f,0x01,0x00,0x00]
 
-v_fract_f16 v255, exec_hi
-// GFX12: v_fract_f16_e64 v255, exec_hi           ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
+v_fract_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_fract_f16 v255, exec_lo
-// GFX12: v_fract_f16_e64 v255, exec_lo           ; encoding: [0xff,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
+v_fract_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_fract_f16 v255, m0
-// GFX12: v_fract_f16_e64 v255, m0                ; encoding: [0xff,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
+v_fract_f16 v255.h, vcc_hi
+// GFX12: v_fract_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x6b,0x00,0x00,0x00]
 
-v_fract_f16 v255, null
-// GFX12: v_fract_f16_e64 v255, null              ; encoding: [0xff,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
+v_fract_f16 v255.h, vcc_lo
+// GFX12: v_fract_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x6a,0x00,0x00,0x00]
 
-v_fract_f16 v255, s1
-// GFX12: v_fract_f16_e64 v255, s1                ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
+v_fract_f16 v255.l, -1
+// GFX12: v_fract_f16_e64 v255.l, -1              ; encoding: [0xff,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
 
-v_fract_f16 v255, s105
-// GFX12: v_fract_f16_e64 v255, s105              ; encoding: [0xff,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
+v_fract_f16 v255.l, 0.5
+// GFX12: v_fract_f16_e64 v255.l, 0.5             ; encoding: [0xff,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x00]
 
-v_fract_f16 v255, src_scc
-// GFX12: v_fract_f16_e64 v255, src_scc           ; encoding: [0xff,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x00]
+v_fract_f16 v255.l, exec_hi
+// GFX12: v_fract_f16_e64 v255.l, exec_hi         ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
 
-v_fract_f16 v255, ttmp15
-// GFX12: v_fract_f16_e64 v255, ttmp15            ; encoding: [0xff,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
+v_fract_f16 v255.l, exec_lo
+// GFX12: v_fract_f16_e64 v255.l, exec_lo         ; encoding: [0xff,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
 
-v_fract_f16 v255, v1
-// GFX12: v_fract_f16_e64 v255, v1                ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+v_fract_f16 v255.l, m0
+// GFX12: v_fract_f16_e64 v255.l, m0              ; encoding: [0xff,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
 
-v_fract_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fract_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_fract_f16 v255.l, null
+// GFX12: v_fract_f16_e64 v255.l, null            ; encoding: [0xff,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
 
-v_fract_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX12: v_fract_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_fract_f16 v255.l, s1
+// GFX12: v_fract_f16_e64 v255.l, s1              ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
 
-v_fract_f16 v255, v127
-// GFX12: v_fract_f16_e64 v255, v127              ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x01,0x00,0x00]
+v_fract_f16 v255.l, s105
+// GFX12: v_fract_f16_e64 v255.l, s105            ; encoding: [0xff,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
 
-v_fract_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fract_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_fract_f16 v255.l, src_scc
+// GFX12: v_fract_f16_e64 v255.l, src_scc         ; encoding: [0xff,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x00]
 
-v_fract_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX12: v_fract_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_fract_f16 v255.l, ttmp15
+// GFX12: v_fract_f16_e64 v255.l, ttmp15          ; encoding: [0xff,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
 
-v_fract_f16 v255, vcc_hi
-// GFX12: v_fract_f16_e64 v255, vcc_hi            ; encoding: [0xff,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
+v_fract_f16 v255.l, v1.l
+// GFX12: v_fract_f16_e64 v255.l, v1.l            ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
 
-v_fract_f16 v255, vcc_lo
-// GFX12: v_fract_f16_e64 v255, vcc_lo            ; encoding: [0xff,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
+v_fract_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_fract_f16 v5, v199
-// GFX12: v_fract_f16_e64 v5, v199                ; encoding: [0x05,0x00,0xdf,0xd5,0xc7,0x01,0x00,0x00]
+v_fract_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_fract_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fract_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_fract_f16 v255.l, v127.l
+// GFX12: v_fract_f16_e64 v255.l, v127.l          ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x01,0x00,0x00]
 
-v_fract_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX12: v_fract_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_fract_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_fract_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_fract_f16 v255.l, vcc_hi
+// GFX12: v_fract_f16_e64 v255.l, vcc_hi          ; encoding: [0xff,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
+
+v_fract_f16 v255.l, vcc_lo
+// GFX12: v_fract_f16_e64 v255.l, vcc_lo          ; encoding: [0xff,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
+
+v_fract_f16 v5.h, v199.h
+// GFX12: v_fract_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0xc7,0x01,0x00,0x00]
+
+v_fract_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_fract_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_fract_f16 v5.l, v199.l
+// GFX12: v_fract_f16_e64 v5.l, v199.l            ; encoding: [0x05,0x00,0xdf,0xd5,0xc7,0x01,0x00,0x00]
+
+v_fract_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_fract_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_frexp_exp_i16_f16 v128.h, 0xfe0b
 // GFX12: v_frexp_exp_i16_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xda,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1366,71 +1516,137 @@ v_frexp_exp_i16_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
 v_frexp_exp_i16_f16 v5.l, v199.l quad_perm:[3,2,1,0]
 // GFX12: v_frexp_exp_i16_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xda,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_frexp_mant_f16 v128, 0xfe0b
-// GFX12: v_frexp_mant_f16_e64 v128, 0xfe0b       ; encoding: [0x80,0x00,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_frexp_mant_f16 v128.h, 0xfe0b
+// GFX12: v_frexp_mant_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_frexp_mant_f16 v128.l, 0xfe0b
+// GFX12: v_frexp_mant_f16_e64 v128.l, 0xfe0b     ; encoding: [0x80,0x00,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, -1
+// GFX12: v_frexp_mant_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xc1,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, 0.5
+// GFX12: v_frexp_mant_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xf0,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, exec_hi
+// GFX12: v_frexp_mant_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7f,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, exec_lo
+// GFX12: v_frexp_mant_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7e,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, m0
+// GFX12: v_frexp_mant_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7d,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, null
+// GFX12: v_frexp_mant_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7c,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, s1
+// GFX12: v_frexp_mant_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x01,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, s105
+// GFX12: v_frexp_mant_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x69,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, src_scc
+// GFX12: v_frexp_mant_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xfd,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, ttmp15
+// GFX12: v_frexp_mant_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7b,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, v1.h
+// GFX12: v_frexp_mant_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00]
 
-v_frexp_mant_f16 v255, -1
-// GFX12: v_frexp_mant_f16_e64 v255, -1           ; encoding: [0xff,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_frexp_mant_f16 v255, 0.5
-// GFX12: v_frexp_mant_f16_e64 v255, 0.5          ; encoding: [0xff,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_frexp_mant_f16 v255, exec_hi
-// GFX12: v_frexp_mant_f16_e64 v255, exec_hi      ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v127.h
+// GFX12: v_frexp_mant_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xd9,0xd5,0x7f,0x01,0x00,0x00]
 
-v_frexp_mant_f16 v255, exec_lo
-// GFX12: v_frexp_mant_f16_e64 v255, exec_lo      ; encoding: [0xff,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_frexp_mant_f16 v255, m0
-// GFX12: v_frexp_mant_f16_e64 v255, m0           ; encoding: [0xff,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_frexp_mant_f16 v255, null
-// GFX12: v_frexp_mant_f16_e64 v255, null         ; encoding: [0xff,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, vcc_hi
+// GFX12: v_frexp_mant_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x6b,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, s1
-// GFX12: v_frexp_mant_f16_e64 v255, s1           ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, vcc_lo
+// GFX12: v_frexp_mant_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x6a,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, s105
-// GFX12: v_frexp_mant_f16_e64 v255, s105         ; encoding: [0xff,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.l, -1
+// GFX12: v_frexp_mant_f16_e64 v255.l, -1         ; encoding: [0xff,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, src_scc
-// GFX12: v_frexp_mant_f16_e64 v255, src_scc      ; encoding: [0xff,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.l, 0.5
+// GFX12: v_frexp_mant_f16_e64 v255.l, 0.5        ; encoding: [0xff,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, ttmp15
-// GFX12: v_frexp_mant_f16_e64 v255, ttmp15       ; encoding: [0xff,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.l, exec_hi
+// GFX12: v_frexp_mant_f16_e64 v255.l, exec_hi    ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v1
-// GFX12: v_frexp_mant_f16_e64 v255, v1           ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+v_frexp_mant_f16 v255.l, exec_lo
+// GFX12: v_frexp_mant_f16_e64 v255.l, exec_lo    ; encoding: [0xff,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_frexp_mant_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16 v255.l, m0
+// GFX12: v_frexp_mant_f16_e64 v255.l, m0         ; encoding: [0xff,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX12: v_frexp_mant_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_frexp_mant_f16 v255.l, null
+// GFX12: v_frexp_mant_f16_e64 v255.l, null       ; encoding: [0xff,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v127
-// GFX12: v_frexp_mant_f16_e64 v255, v127         ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x01,0x00,0x00]
+v_frexp_mant_f16 v255.l, s1
+// GFX12: v_frexp_mant_f16_e64 v255.l, s1         ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_frexp_mant_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_frexp_mant_f16 v255.l, s105
+// GFX12: v_frexp_mant_f16_e64 v255.l, s105       ; encoding: [0xff,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX12: v_frexp_mant_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_frexp_mant_f16 v255.l, src_scc
+// GFX12: v_frexp_mant_f16_e64 v255.l, src_scc    ; encoding: [0xff,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, vcc_hi
-// GFX12: v_frexp_mant_f16_e64 v255, vcc_hi       ; encoding: [0xff,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.l, ttmp15
+// GFX12: v_frexp_mant_f16_e64 v255.l, ttmp15     ; encoding: [0xff,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, vcc_lo
-// GFX12: v_frexp_mant_f16_e64 v255, vcc_lo       ; encoding: [0xff,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.l, v1.l
+// GFX12: v_frexp_mant_f16_e64 v255.l, v1.l       ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
 
-v_frexp_mant_f16 v5, v199
-// GFX12: v_frexp_mant_f16_e64 v5, v199           ; encoding: [0x05,0x00,0xd9,0xd5,0xc7,0x01,0x00,0x00]
+v_frexp_mant_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_frexp_mant_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_frexp_mant_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_frexp_mant_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_frexp_mant_f16 v255.l, v127.l
+// GFX12: v_frexp_mant_f16_e64 v255.l, v127.l     ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x01,0x00,0x00]
+
+v_frexp_mant_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_frexp_mant_f16 v255.l, vcc_hi
+// GFX12: v_frexp_mant_f16_e64 v255.l, vcc_hi     ; encoding: [0xff,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.l, vcc_lo
+// GFX12: v_frexp_mant_f16_e64 v255.l, vcc_lo     ; encoding: [0xff,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v5.h, v199.h
+// GFX12: v_frexp_mant_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0xc7,0x01,0x00,0x00]
+
+v_frexp_mant_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_frexp_mant_f16 v5.l, v199.l
+// GFX12: v_frexp_mant_f16_e64 v5.l, v199.l       ; encoding: [0x05,0x00,0xd9,0xd5,0xc7,0x01,0x00,0x00]
+
+v_frexp_mant_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_log_f16 v128, 0xfe0b
 // GFX12: v_log_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xd7,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1498,71 +1714,137 @@ v_log_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_log_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: v_log_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_not_b16 v128, 0xfe0b
-// GFX12: v_not_b16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_not_b16 v128.h, 0xfe0b
+// GFX12: v_not_b16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_not_b16 v128.l, 0xfe0b
+// GFX12: v_not_b16_e64 v128.l, 0xfe0b            ; encoding: [0x80,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_not_b16 v255.h, -1
+// GFX12: v_not_b16_e64 v255.h, -1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe9,0xd5,0xc1,0x00,0x00,0x00]
+
+v_not_b16 v255.h, 0.5
+// GFX12: v_not_b16_e64 v255.h, 0.5 op_sel:[0,1]  ; encoding: [0xff,0x40,0xe9,0xd5,0xf0,0x00,0x00,0x00]
+
+v_not_b16 v255.h, exec_hi
+// GFX12: v_not_b16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7f,0x00,0x00,0x00]
+
+v_not_b16 v255.h, exec_lo
+// GFX12: v_not_b16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7e,0x00,0x00,0x00]
+
+v_not_b16 v255.h, m0
+// GFX12: v_not_b16_e64 v255.h, m0 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe9,0xd5,0x7d,0x00,0x00,0x00]
+
+v_not_b16 v255.h, null
+// GFX12: v_not_b16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7c,0x00,0x00,0x00]
+
+v_not_b16 v255.h, s1
+// GFX12: v_not_b16_e64 v255.h, s1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe9,0xd5,0x01,0x00,0x00,0x00]
+
+v_not_b16 v255.h, s105
+// GFX12: v_not_b16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x69,0x00,0x00,0x00]
+
+v_not_b16 v255.h, src_scc
+// GFX12: v_not_b16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xfd,0x00,0x00,0x00]
 
-v_not_b16 v255, -1
-// GFX12: v_not_b16_e64 v255, -1                  ; encoding: [0xff,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
+v_not_b16 v255.h, ttmp15
+// GFX12: v_not_b16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_not_b16 v255, 0.5
-// GFX12: v_not_b16_e64 v255, 0.5                 ; encoding: [0xff,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00]
+v_not_b16 v255.h, v1.h
+// GFX12: v_not_b16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00]
 
-v_not_b16 v255, exec_hi
-// GFX12: v_not_b16_e64 v255, exec_hi             ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
+v_not_b16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_not_b16 v255, exec_lo
-// GFX12: v_not_b16_e64 v255, exec_lo             ; encoding: [0xff,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
+v_not_b16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_not_b16 v255, m0
-// GFX12: v_not_b16_e64 v255, m0                  ; encoding: [0xff,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
+v_not_b16 v255.h, v127.h
+// GFX12: v_not_b16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe9,0xd5,0x7f,0x01,0x00,0x00]
 
-v_not_b16 v255, null
-// GFX12: v_not_b16_e64 v255, null                ; encoding: [0xff,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
+v_not_b16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_not_b16 v255, s1
-// GFX12: v_not_b16_e64 v255, s1                  ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
+v_not_b16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_not_b16 v255, s105
-// GFX12: v_not_b16_e64 v255, s105                ; encoding: [0xff,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
+v_not_b16 v255.h, vcc_hi
+// GFX12: v_not_b16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x6b,0x00,0x00,0x00]
 
-v_not_b16 v255, src_scc
-// GFX12: v_not_b16_e64 v255, src_scc             ; encoding: [0xff,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
+v_not_b16 v255.h, vcc_lo
+// GFX12: v_not_b16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x6a,0x00,0x00,0x00]
 
-v_not_b16 v255, ttmp15
-// GFX12: v_not_b16_e64 v255, ttmp15              ; encoding: [0xff,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
+v_not_b16 v255.l, -1
+// GFX12: v_not_b16_e64 v255.l, -1                ; encoding: [0xff,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
 
-v_not_b16 v255, v1
-// GFX12: v_not_b16_e64 v255, v1                  ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+v_not_b16 v255.l, 0.5
+// GFX12: v_not_b16_e64 v255.l, 0.5               ; encoding: [0xff,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00]
 
-v_not_b16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_not_b16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_not_b16 v255.l, exec_hi
+// GFX12: v_not_b16_e64 v255.l, exec_hi           ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
 
-v_not_b16 v255, v1 quad_perm:[3,2,1,0]
-// GFX12: v_not_b16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_not_b16 v255.l, exec_lo
+// GFX12: v_not_b16_e64 v255.l, exec_lo           ; encoding: [0xff,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
 
-v_not_b16 v255, v127
-// GFX12: v_not_b16_e64 v255, v127                ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x01,0x00,0x00]
+v_not_b16 v255.l, m0
+// GFX12: v_not_b16_e64 v255.l, m0                ; encoding: [0xff,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
 
-v_not_b16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_not_b16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_not_b16 v255.l, null
+// GFX12: v_not_b16_e64 v255.l, null              ; encoding: [0xff,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
 
-v_not_b16 v255, v127 quad_perm:[3,2,1,0]
-// GFX12: v_not_b16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_not_b16 v255.l, s1
+// GFX12: v_not_b16_e64 v255.l, s1                ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
 
-v_not_b16 v255, vcc_hi
-// GFX12: v_not_b16_e64 v255, vcc_hi              ; encoding: [0xff,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
+v_not_b16 v255.l, s105
+// GFX12: v_not_b16_e64 v255.l, s105              ; encoding: [0xff,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
 
-v_not_b16 v255, vcc_lo
-// GFX12: v_not_b16_e64 v255, vcc_lo              ; encoding: [0xff,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
+v_not_b16 v255.l, src_scc
+// GFX12: v_not_b16_e64 v255.l, src_scc           ; encoding: [0xff,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
 
-v_not_b16 v5, v199
-// GFX12: v_not_b16_e64 v5, v199                  ; encoding: [0x05,0x00,0xe9,0xd5,0xc7,0x01,0x00,0x00]
+v_not_b16 v255.l, ttmp15
+// GFX12: v_not_b16_e64 v255.l, ttmp15            ; encoding: [0xff,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_not_b16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_not_b16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_not_b16 v255.l, v1.l
+// GFX12: v_not_b16_e64 v255.l, v1.l              ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
 
-v_not_b16 v5, v199 quad_perm:[3,2,1,0]
-// GFX12: v_not_b16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_not_b16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_not_b16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_not_b16 v255.l, v127.l
+// GFX12: v_not_b16_e64 v255.l, v127.l            ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x01,0x00,0x00]
+
+v_not_b16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_not_b16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_not_b16 v255.l, vcc_hi
+// GFX12: v_not_b16_e64 v255.l, vcc_hi            ; encoding: [0xff,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
+
+v_not_b16 v255.l, vcc_lo
+// GFX12: v_not_b16_e64 v255.l, vcc_lo            ; encoding: [0xff,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
+
+v_not_b16 v5.h, v199.h
+// GFX12: v_not_b16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe9,0xd5,0xc7,0x01,0x00,0x00]
+
+v_not_b16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_not_b16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_not_b16 v5.l, v199.l
+// GFX12: v_not_b16_e64 v5.l, v199.l              ; encoding: [0x05,0x00,0xe9,0xd5,0xc7,0x01,0x00,0x00]
+
+v_not_b16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_not_b16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_rcp_f16 v128, 0xfe0b
 // GFX12: v_rcp_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xd4,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1630,71 +1912,137 @@ v_rcp_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_rcp_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: v_rcp_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_rndne_f16 v128, 0xfe0b
-// GFX12: v_rndne_f16_e64 v128, 0xfe0b            ; encoding: [0x80,0x00,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_rndne_f16 v128.h, 0xfe0b
+// GFX12: v_rndne_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_rndne_f16 v128.l, 0xfe0b
+// GFX12: v_rndne_f16_e64 v128.l, 0xfe0b          ; encoding: [0x80,0x00,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_rndne_f16 v255.h, -1
+// GFX12: v_rndne_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xc1,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, 0.5
+// GFX12: v_rndne_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xf0,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, exec_hi
+// GFX12: v_rndne_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7f,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, exec_lo
+// GFX12: v_rndne_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7e,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, m0
+// GFX12: v_rndne_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7d,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, null
+// GFX12: v_rndne_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7c,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, s1
+// GFX12: v_rndne_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x01,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, s105
+// GFX12: v_rndne_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x69,0x00,0x00,0x00]
 
-v_rndne_f16 v255, -1
-// GFX12: v_rndne_f16_e64 v255, -1                ; encoding: [0xff,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+v_rndne_f16 v255.h, src_scc
+// GFX12: v_rndne_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xfd,0x00,0x00,0x00]
 
-v_rndne_f16 v255, 0.5
-// GFX12: v_rndne_f16_e64 v255, 0.5               ; encoding: [0xff,0x00,0xde,0xd5,0xf0,0x00,0x00,0x00]
+v_rndne_f16 v255.h, ttmp15
+// GFX12: v_rndne_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
-v_rndne_f16 v255, exec_hi
-// GFX12: v_rndne_f16_e64 v255, exec_hi           ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v1.h
+// GFX12: v_rndne_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
 
-v_rndne_f16 v255, exec_lo
-// GFX12: v_rndne_f16_e64 v255, exec_lo           ; encoding: [0xff,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_rndne_f16 v255, m0
-// GFX12: v_rndne_f16_e64 v255, m0                ; encoding: [0xff,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_rndne_f16 v255, null
-// GFX12: v_rndne_f16_e64 v255, null              ; encoding: [0xff,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v127.h
+// GFX12: v_rndne_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xde,0xd5,0x7f,0x01,0x00,0x00]
 
-v_rndne_f16 v255, s1
-// GFX12: v_rndne_f16_e64 v255, s1                ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_rndne_f16 v255, s105
-// GFX12: v_rndne_f16_e64 v255, s105              ; encoding: [0xff,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_rndne_f16 v255, src_scc
-// GFX12: v_rndne_f16_e64 v255, src_scc           ; encoding: [0xff,0x00,0xde,0xd5,0xfd,0x00,0x00,0x00]
+v_rndne_f16 v255.h, vcc_hi
+// GFX12: v_rndne_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
-v_rndne_f16 v255, ttmp15
-// GFX12: v_rndne_f16_e64 v255, ttmp15            ; encoding: [0xff,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+v_rndne_f16 v255.h, vcc_lo
+// GFX12: v_rndne_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v1
-// GFX12: v_rndne_f16_e64 v255, v1                ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+v_rndne_f16 v255.l, -1
+// GFX12: v_rndne_f16_e64 v255.l, -1              ; encoding: [0xff,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_rndne_f16 v255.l, 0.5
+// GFX12: v_rndne_f16_e64 v255.l, 0.5             ; encoding: [0xff,0x00,0xde,0xd5,0xf0,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_rndne_f16 v255.l, exec_hi
+// GFX12: v_rndne_f16_e64 v255.l, exec_hi         ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v127
-// GFX12: v_rndne_f16_e64 v255, v127              ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x01,0x00,0x00]
+v_rndne_f16 v255.l, exec_lo
+// GFX12: v_rndne_f16_e64 v255.l, exec_lo         ; encoding: [0xff,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_rndne_f16 v255.l, m0
+// GFX12: v_rndne_f16_e64 v255.l, m0              ; encoding: [0xff,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_rndne_f16 v255.l, null
+// GFX12: v_rndne_f16_e64 v255.l, null            ; encoding: [0xff,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
-v_rndne_f16 v255, vcc_hi
-// GFX12: v_rndne_f16_e64 v255, vcc_hi            ; encoding: [0xff,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+v_rndne_f16 v255.l, s1
+// GFX12: v_rndne_f16_e64 v255.l, s1              ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
 
-v_rndne_f16 v255, vcc_lo
-// GFX12: v_rndne_f16_e64 v255, vcc_lo            ; encoding: [0xff,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+v_rndne_f16 v255.l, s105
+// GFX12: v_rndne_f16_e64 v255.l, s105            ; encoding: [0xff,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199
-// GFX12: v_rndne_f16_e64 v5, v199                ; encoding: [0x05,0x00,0xde,0xd5,0xc7,0x01,0x00,0x00]
+v_rndne_f16 v255.l, src_scc
+// GFX12: v_rndne_f16_e64 v255.l, src_scc         ; encoding: [0xff,0x00,0xde,0xd5,0xfd,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_rndne_f16 v255.l, ttmp15
+// GFX12: v_rndne_f16_e64 v255.l, ttmp15          ; encoding: [0xff,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_rndne_f16 v255.l, v1.l
+// GFX12: v_rndne_f16_e64 v255.l, v1.l            ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+v_rndne_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_rndne_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_rndne_f16 v255.l, v127.l
+// GFX12: v_rndne_f16_e64 v255.l, v127.l          ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x01,0x00,0x00]
+
+v_rndne_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_rndne_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_rndne_f16 v255.l, vcc_hi
+// GFX12: v_rndne_f16_e64 v255.l, vcc_hi          ; encoding: [0xff,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, vcc_lo
+// GFX12: v_rndne_f16_e64 v255.l, vcc_lo          ; encoding: [0xff,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+
+v_rndne_f16 v5.h, v199.h
+// GFX12: v_rndne_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xde,0xd5,0xc7,0x01,0x00,0x00]
+
+v_rndne_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_rndne_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_rndne_f16 v5.l, v199.l
+// GFX12: v_rndne_f16_e64 v5.l, v199.l            ; encoding: [0x05,0x00,0xde,0xd5,0xc7,0x01,0x00,0x00]
+
+v_rndne_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_rndne_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_rsq_f16 v128, 0xfe0b
 // GFX12: v_rsq_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xd6,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1771,71 +2119,146 @@ v_sat_pk_u8_i16 v199, v5 dpp8:[7,6,5,4,3,2,1,0]
 v_sat_pk_u8_i16 v199, v5 quad_perm:[3,2,1,0]
 // GFX12: v_sat_pk_u8_i16_e64_dpp v199, v5 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff]
 
-v_sin_f16 v128, 0xfe0b
-// GFX12: v_sin_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_sat_pk_u8_i16 v199.h, v5
+// GFX12: v_sat_pk_u8_i16_e64 v199.h, v5 op_sel:[0,1] ; encoding: [0xc7,0x40,0xe2,0xd5,0x05,0x01,0x00,0x00]
+
+v_sat_pk_u8_i16 v199.h, v5 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sat_pk_u8_i16_e64_dpp v199.h, v5 op_sel:[0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xc7,0x40,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x05,0x77,0x39,0x05]
+
+v_sat_pk_u8_i16 v199.h, v5 quad_perm:[3,2,1,0]
+// GFX12: v_sat_pk_u8_i16_e64_dpp v199.h, v5 op_sel:[0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff]
+
+v_sin_f16 v128.h, 0xfe0b
+// GFX12: v_sin_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_sin_f16 v128.l, 0xfe0b
+// GFX12: v_sin_f16_e64 v128.l, 0xfe0b            ; encoding: [0x80,0x00,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_sin_f16 v255.h, -1
+// GFX12: v_sin_f16_e64 v255.h, -1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe0,0xd5,0xc1,0x00,0x00,0x00]
+
+v_sin_f16 v255.h, 0.5
+// GFX12: v_sin_f16_e64 v255.h, 0.5 op_sel:[0,1]  ; encoding: [0xff,0x40,0xe0,0xd5,0xf0,0x00,0x00,0x00]
+
+v_sin_f16 v255.h, exec_hi
+// GFX12: v_sin_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7f,0x00,0x00,0x00]
 
-v_sin_f16 v255, -1
-// GFX12: v_sin_f16_e64 v255, -1                  ; encoding: [0xff,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
+v_sin_f16 v255.h, exec_lo
+// GFX12: v_sin_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7e,0x00,0x00,0x00]
 
-v_sin_f16 v255, 0.5
-// GFX12: v_sin_f16_e64 v255, 0.5                 ; encoding: [0xff,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x00]
+v_sin_f16 v255.h, m0
+// GFX12: v_sin_f16_e64 v255.h, m0 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe0,0xd5,0x7d,0x00,0x00,0x00]
 
-v_sin_f16 v255, exec_hi
-// GFX12: v_sin_f16_e64 v255, exec_hi             ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
+v_sin_f16 v255.h, null
+// GFX12: v_sin_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7c,0x00,0x00,0x00]
 
-v_sin_f16 v255, exec_lo
-// GFX12: v_sin_f16_e64 v255, exec_lo             ; encoding: [0xff,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
+v_sin_f16 v255.h, s1
+// GFX12: v_sin_f16_e64 v255.h, s1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe0,0xd5,0x01,0x00,0x00,0x00]
 
-v_sin_f16 v255, m0
-// GFX12: v_sin_f16_e64 v255, m0                  ; encoding: [0xff,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
+v_sin_f16 v255.h, s105
+// GFX12: v_sin_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x69,0x00,0x00,0x00]
 
-v_sin_f16 v255, null
-// GFX12: v_sin_f16_e64 v255, null                ; encoding: [0xff,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
+v_sin_f16 v255.h, src_scc
+// GFX12: v_sin_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0xfd,0x00,0x00,0x00]
 
-v_sin_f16 v255, s1
-// GFX12: v_sin_f16_e64 v255, s1                  ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
+v_sin_f16 v255.h, ttmp15
+// GFX12: v_sin_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7b,0x00,0x00,0x00]
 
-v_sin_f16 v255, s105
-// GFX12: v_sin_f16_e64 v255, s105                ; encoding: [0xff,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
+v_sin_f16 v255.h, v1.h
+// GFX12: v_sin_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00]
 
-v_sin_f16 v255, src_scc
-// GFX12: v_sin_f16_e64 v255, src_scc             ; encoding: [0xff,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x00]
+v_sin_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_sin_f16 v255, ttmp15
-// GFX12: v_sin_f16_e64 v255, ttmp15              ; encoding: [0xff,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
+v_sin_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_sin_f16 v255, v1
-// GFX12: v_sin_f16_e64 v255, v1                  ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+v_sin_f16 v255.h, v127.h
+// GFX12: v_sin_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe0,0xd5,0x7f,0x01,0x00,0x00]
 
-v_sin_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_sin_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_sin_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_sin_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX12: v_sin_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_sin_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_sin_f16 v255, v127
-// GFX12: v_sin_f16_e64 v255, v127                ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x01,0x00,0x00]
+v_sin_f16 v255.h, vcc_hi
+// GFX12: v_sin_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x6b,0x00,0x00,0x00]
 
-v_sin_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_sin_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_sin_f16 v255.h, vcc_lo
+// GFX12: v_sin_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x6a,0x00,0x00,0x00]
 
-v_sin_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX12: v_sin_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_sin_f16 v255.l, -1
+// GFX12: v_sin_f16_e64 v255.l, -1                ; encoding: [0xff,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
 
-v_sin_f16 v255, vcc_hi
-// GFX12: v_sin_f16_e64 v255, vcc_hi              ; encoding: [0xff,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
+v_sin_f16 v255.l, 0.5
+// GFX12: v_sin_f16_e64 v255.l, 0.5               ; encoding: [0xff,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x00]
 
-v_sin_f16 v255, vcc_lo
-// GFX12: v_sin_f16_e64 v255, vcc_lo              ; encoding: [0xff,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
+v_sin_f16 v255.l, exec_hi
+// GFX12: v_sin_f16_e64 v255.l, exec_hi           ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
 
-v_sin_f16 v5, v199
-// GFX12: v_sin_f16_e64 v5, v199                  ; encoding: [0x05,0x00,0xe0,0xd5,0xc7,0x01,0x00,0x00]
+v_sin_f16 v255.l, exec_lo
+// GFX12: v_sin_f16_e64 v255.l, exec_lo           ; encoding: [0xff,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
 
-v_sin_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_sin_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_sin_f16 v255.l, m0
+// GFX12: v_sin_f16_e64 v255.l, m0                ; encoding: [0xff,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
 
-v_sin_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX12: v_sin_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_sin_f16 v255.l, null
+// GFX12: v_sin_f16_e64 v255.l, null              ; encoding: [0xff,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, s1
+// GFX12: v_sin_f16_e64 v255.l, s1                ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, s105
+// GFX12: v_sin_f16_e64 v255.l, s105              ; encoding: [0xff,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, src_scc
+// GFX12: v_sin_f16_e64 v255.l, src_scc           ; encoding: [0xff,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, ttmp15
+// GFX12: v_sin_f16_e64 v255.l, ttmp15            ; encoding: [0xff,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, v1.l
+// GFX12: v_sin_f16_e64 v255.l, v1.l              ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+
+v_sin_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_sin_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_sin_f16 v255.l, v127.l
+// GFX12: v_sin_f16_e64 v255.l, v127.l            ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x01,0x00,0x00]
+
+v_sin_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_sin_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_sin_f16 v255.l, vcc_hi
+// GFX12: v_sin_f16_e64 v255.l, vcc_hi            ; encoding: [0xff,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, vcc_lo
+// GFX12: v_sin_f16_e64 v255.l, vcc_lo            ; encoding: [0xff,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
+
+v_sin_f16 v5.h, v199.h
+// GFX12: v_sin_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe0,0xd5,0xc7,0x01,0x00,0x00]
+
+v_sin_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_sin_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_sin_f16 v5.l, v199.l
+// GFX12: v_sin_f16_e64 v5.l, v199.l              ; encoding: [0x05,0x00,0xe0,0xd5,0xc7,0x01,0x00,0x00]
+
+v_sin_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_sin_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_sqrt_f16 v128, 0xfe0b
 // GFX12: v_sqrt_f16_e64 v128, 0xfe0b             ; encoding: [0x80,0x00,0xd5,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1903,68 +2326,134 @@ v_sqrt_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_sqrt_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: v_sqrt_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_trunc_f16 v128, 0xfe0b
-// GFX12: v_trunc_f16_e64 v128, 0xfe0b            ; encoding: [0x80,0x00,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_trunc_f16 v128.h, 0xfe0b
+// GFX12: v_trunc_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_trunc_f16 v128.l, 0xfe0b
+// GFX12: v_trunc_f16_e64 v128.l, 0xfe0b          ; encoding: [0x80,0x00,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_trunc_f16 v255.h, -1
+// GFX12: v_trunc_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, 0.5
+// GFX12: v_trunc_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xf0,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, exec_hi
+// GFX12: v_trunc_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7f,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, exec_lo
+// GFX12: v_trunc_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7e,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, m0
+// GFX12: v_trunc_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7d,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, null
+// GFX12: v_trunc_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7c,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, s1
+// GFX12: v_trunc_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x01,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, s105
+// GFX12: v_trunc_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x69,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, src_scc
+// GFX12: v_trunc_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xfd,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, ttmp15
+// GFX12: v_trunc_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7b,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, v1.h
+// GFX12: v_trunc_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00]
+
+v_trunc_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_trunc_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_trunc_f16 v255.h, v127.h
+// GFX12: v_trunc_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdd,0xd5,0x7f,0x01,0x00,0x00]
+
+v_trunc_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_trunc_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_trunc_f16 v255.h, vcc_hi
+// GFX12: v_trunc_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x6b,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, vcc_lo
+// GFX12: v_trunc_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x6a,0x00,0x00,0x00]
+
+v_trunc_f16 v255.l, -1
+// GFX12: v_trunc_f16_e64 v255.l, -1              ; encoding: [0xff,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+
+v_trunc_f16 v255.l, 0.5
+// GFX12: v_trunc_f16_e64 v255.l, 0.5             ; encoding: [0xff,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x00]
+
+v_trunc_f16 v255.l, exec_hi
+// GFX12: v_trunc_f16_e64 v255.l, exec_hi         ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
 
-v_trunc_f16 v255, -1
-// GFX12: v_trunc_f16_e64 v255, -1                ; encoding: [0xff,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+v_trunc_f16 v255.l, exec_lo
+// GFX12: v_trunc_f16_e64 v255.l, exec_lo         ; encoding: [0xff,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
 
-v_trunc_f16 v255, 0.5
-// GFX12: v_trunc_f16_e64 v255, 0.5               ; encoding: [0xff,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x00]
+v_trunc_f16 v255.l, m0
+// GFX12: v_trunc_f16_e64 v255.l, m0              ; encoding: [0xff,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
 
-v_trunc_f16 v255, exec_hi
-// GFX12: v_trunc_f16_e64 v255, exec_hi           ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
+v_trunc_f16 v255.l, null
+// GFX12: v_trunc_f16_e64 v255.l, null            ; encoding: [0xff,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
 
-v_trunc_f16 v255, exec_lo
-// GFX12: v_trunc_f16_e64 v255, exec_lo           ; encoding: [0xff,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
+v_trunc_f16 v255.l, s1
+// GFX12: v_trunc_f16_e64 v255.l, s1              ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
 
-v_trunc_f16 v255, m0
-// GFX12: v_trunc_f16_e64 v255, m0                ; encoding: [0xff,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
+v_trunc_f16 v255.l, s105
+// GFX12: v_trunc_f16_e64 v255.l, s105            ; encoding: [0xff,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
 
-v_trunc_f16 v255, null
-// GFX12: v_trunc_f16_e64 v255, null              ; encoding: [0xff,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
+v_trunc_f16 v255.l, src_scc
+// GFX12: v_trunc_f16_e64 v255.l, src_scc         ; encoding: [0xff,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x00]
 
-v_trunc_f16 v255, s1
-// GFX12: v_trunc_f16_e64 v255, s1                ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
+v_trunc_f16 v255.l, ttmp15
+// GFX12: v_trunc_f16_e64 v255.l, ttmp15          ; encoding: [0xff,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
 
-v_trunc_f16 v255, s105
-// GFX12: v_trunc_f16_e64 v255, s105              ; encoding: [0xff,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
+v_trunc_f16 v255.l, v1.l
+// GFX12: v_trunc_f16_e64 v255.l, v1.l            ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
 
-v_trunc_f16 v255, src_scc
-// GFX12: v_trunc_f16_e64 v255, src_scc           ; encoding: [0xff,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x00]
+v_trunc_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_trunc_f16 v255, ttmp15
-// GFX12: v_trunc_f16_e64 v255, ttmp15            ; encoding: [0xff,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
+v_trunc_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_trunc_f16 v255, v1
-// GFX12: v_trunc_f16_e64 v255, v1                ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+v_trunc_f16 v255.l, v127.l
+// GFX12: v_trunc_f16_e64 v255.l, v127.l          ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x01,0x00,0x00]
 
-v_trunc_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_trunc_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_trunc_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_trunc_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX12: v_trunc_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_trunc_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_trunc_f16 v255, v127
-// GFX12: v_trunc_f16_e64 v255, v127              ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x01,0x00,0x00]
+v_trunc_f16 v255.l, vcc_hi
+// GFX12: v_trunc_f16_e64 v255.l, vcc_hi          ; encoding: [0xff,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
 
-v_trunc_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_trunc_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_trunc_f16 v255.l, vcc_lo
+// GFX12: v_trunc_f16_e64 v255.l, vcc_lo          ; encoding: [0xff,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
 
-v_trunc_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX12: v_trunc_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_trunc_f16 v5.h, v199.h
+// GFX12: v_trunc_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdd,0xd5,0xc7,0x01,0x00,0x00]
 
-v_trunc_f16 v255, vcc_hi
-// GFX12: v_trunc_f16_e64 v255, vcc_hi            ; encoding: [0xff,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
+v_trunc_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
 
-v_trunc_f16 v255, vcc_lo
-// GFX12: v_trunc_f16_e64 v255, vcc_lo            ; encoding: [0xff,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
+v_trunc_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_trunc_f16 v5, v199
-// GFX12: v_trunc_f16_e64 v5, v199                ; encoding: [0x05,0x00,0xdd,0xd5,0xc7,0x01,0x00,0x00]
+v_trunc_f16 v5.l, v199.l
+// GFX12: v_trunc_f16_e64 v5.l, v199.l            ; encoding: [0x05,0x00,0xdd,0xd5,0xc7,0x01,0x00,0x00]
 
-v_trunc_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_trunc_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_trunc_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
 
-v_trunc_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX12: v_trunc_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_trunc_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
index 0309b2e..3e7b7d2 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
@@ -2234,50 +2234,62 @@ v_fma_dx9_zero_f32 v5, -src_scc, |vcc_lo|, -1 mul:4
 v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2
 // GFX12: v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x09,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf]
 
-v_fma_f16 v5, v1, v2, s3
-// GFX12: v_fma_f16 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+v_fma_f16 v5.l, v1.l, v2.l, s3
+// GFX12: v_fma_f16 v5.l, v1.l, v2.l, s3          ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
 
-v_fma_f16 v5, v255, s2, s105
-// GFX12: v_fma_f16 v5, v255, s2, s105            ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+v_fma_f16 v5.l, v255.l, s2, s105
+// GFX12: v_fma_f16 v5.l, v255.l, s2, s105        ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
 
-v_fma_f16 v5, s1, v255, exec_hi
-// GFX12: v_fma_f16 v5, s1, v255, exec_hi         ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+v_fma_f16 v5.l, s1, v255.l, exec_hi
+// GFX12: v_fma_f16 v5.l, s1, v255.l, exec_hi     ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
 
-v_fma_f16 v5, s105, s105, exec_lo
-// GFX12: v_fma_f16 v5, s105, s105, exec_lo       ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+v_fma_f16 v5.l, s105, s105, exec_lo
+// GFX12: v_fma_f16 v5.l, s105, s105, exec_lo     ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
 
-v_fma_f16 v5, vcc_lo, ttmp15, v3
-// GFX12: v_fma_f16 v5, vcc_lo, ttmp15, v3        ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX12: v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l    ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_fma_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX12: v_fma_f16 v5, vcc_hi, 0xfe0b, v255      ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX12: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l  ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX12: v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX12: v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_fma_f16 v5, m0, 0.5, m0
-// GFX12: v_fma_f16 v5, m0, 0.5, m0               ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+v_fma_f16 v5.l, m0, 0.5, m0
+// GFX12: v_fma_f16 v5.l, m0, 0.5, m0             ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_fma_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX12: v_fma_f16 v5, |exec_lo|, -1, vcc_hi     ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX12: v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi   ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
 
-v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1]
-// GFX12: v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_fma_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1]
+// GFX12: v_fma_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_fma_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0]
-// GFX12: v_fma_f16 v5, null, exec_lo, -|0xfe0b|  ; encoding: [0x05,0x04,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v5.l, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0]
+// GFX12: v_fma_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0]
-// GFX12: v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0]
+// GFX12: v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0]
-// GFX12: v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0]
+// GFX12: v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
 
-v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
-// GFX12: v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
+// GFX12: v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
 
-v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp
-// GFX12: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp
+// GFX12: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+
+v_fma_f16 v5.l, v255.h, s2, s105
+// GFX12: v_fma_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+
+v_fma_f16 v5.l, s1, v255.h, exec_hi
+// GFX12: v_fma_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+
+v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX12: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp
+// GFX12: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
 v_fma_f32 v5, v1, v2, s3
 // GFX12: v_fma_f32 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x13,0xd6,0x01,0x05,0x0e,0x00]
@@ -3164,50 +3176,62 @@ v_mad_co_u64_u32 v[5:6], ttmp[14:15], src_scc, vcc_lo, src_scc
 v_mad_co_u64_u32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp
 // GFX12: v_mad_co_u64_u32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp ; encoding: [0xfe,0xfc,0xfe,0xd6,0xff,0xd6,0xc0,0x03,0x56,0x34,0x12,0xaf]
 
-v_max3_num_f16 v5, v1, v2, s3
-// GFX12: v_max3_num_f16 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00]
+v_max3_num_f16 v5.l, v1.l, v2.l, s3
+// GFX12: v_max3_num_f16 v5.l, v1.l, v2.l, s3     ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00]
+
+v_max3_num_f16 v5.l, v255.l, s2, s105
+// GFX12: v_max3_num_f16 v5.l, v255.l, s2, s105   ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01]
 
-v_max3_num_f16 v5, v255, s2, s105
-// GFX12: v_max3_num_f16 v5, v255, s2, s105       ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01]
+v_max3_num_f16 v5.l, s1, v255.l, exec_hi
+// GFX12: v_max3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01]
 
-v_max3_num_f16 v5, s1, v255, exec_hi
-// GFX12: v_max3_num_f16 v5, s1, v255, exec_hi    ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01]
+v_max3_num_f16 v5.l, s105, s105, exec_lo
+// GFX12: v_max3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01]
 
-v_max3_num_f16 v5, s105, s105, exec_lo
-// GFX12: v_max3_num_f16 v5, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01]
+v_max3_num_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX12: v_max3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_max3_num_f16 v5, vcc_lo, ttmp15, v3
-// GFX12: v_max3_num_f16 v5, vcc_lo, ttmp15, v3   ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04]
+v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX12: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX12: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_max3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX12: v_max3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX12: v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1]
+v_max3_num_f16 v5.l, m0, 0.5, m0
+// GFX12: v_max3_num_f16 v5.l, m0, 0.5, m0        ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_max3_num_f16 v5, m0, 0.5, m0
-// GFX12: v_max3_num_f16 v5, m0, 0.5, m0          ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01]
+v_max3_num_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX12: v_max3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01]
 
-v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX12: v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01]
+v_max3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1]
+// GFX12: v_max3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1]
-// GFX12: v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_max3_num_f16 v5.l, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0]
+// GFX12: v_max3_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0]
-// GFX12: v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_max3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0]
+// GFX12: v_max3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0]
-// GFX12: v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_max3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0]
+// GFX12: v_max3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43]
 
-v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0]
-// GFX12: v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43]
+v_max3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
+// GFX12: v_max3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23]
 
-v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
-// GFX12: v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23]
+v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp
+// GFX12: v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
-v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp
-// GFX12: v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+v_max3_num_f16 v5.l, v255.h, s2, s105
+// GFX12: v_max3_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x2c,0xd6,0xff,0x05,0xa4,0x01]
+
+v_max3_num_f16 v5.l, s1, v255.h, exec_hi
+// GFX12: v_max3_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0x01,0xfe,0xff,0x01]
+
+v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX12: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp
+// GFX12: v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
 v_max3_num_f32 v5, v1, v2, s3
 // GFX12: v_max3_num_f32 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x2a,0xd6,0x01,0x05,0x0e,0x00]
@@ -3566,50 +3590,62 @@ v_max_u16 v255.l, 0xfe0b, vcc_hi
 v_max_u16 v255.h, 0xfe0b, vcc_hi
 // GFX12: v_max_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] ; encoding: [0xff,0x40,0x09,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_maxmin_num_f16 v5, v1, v2, s3
-// GFX12: v_maxmin_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
+v_maxmin_num_f16 v5.l, v1.l, v2.l, s3
+// GFX12: v_maxmin_num_f16 v5.l, v1.l, v2.l, s3   ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
+
+v_maxmin_num_f16 v5.l, v255.l, s2, s105
+// GFX12: v_maxmin_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+
+v_maxmin_num_f16 v5.l, s1, v255.l, exec_hi
+// GFX12: v_maxmin_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
 
-v_maxmin_num_f16 v5, v255, s2, s105
-// GFX12: v_maxmin_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+v_maxmin_num_f16 v5.l, s105, s105, exec_lo
+// GFX12: v_maxmin_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
 
-v_maxmin_num_f16 v5, s1, v255, exec_hi
-// GFX12: v_maxmin_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+v_maxmin_num_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX12: v_maxmin_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_maxmin_num_f16 v5, s105, s105, exec_lo
-// GFX12: v_maxmin_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
+v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX12: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3
-// GFX12: v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
+v_maxmin_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX12: v_maxmin_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX12: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_maxmin_num_f16 v5.l, m0, 0.5, m0
+// GFX12: v_maxmin_num_f16 v5.l, m0, 0.5, m0      ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX12: v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
+v_maxmin_num_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX12: v_maxmin_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
 
-v_maxmin_num_f16 v5, m0, 0.5, m0
-// GFX12: v_maxmin_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
+v_maxmin_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo|
+// GFX12: v_maxmin_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX12: v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
+v_maxmin_num_f16 v5.l, null, exec_lo, -|0xfe0b|
+// GFX12: v_maxmin_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo|
-// GFX12: v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_maxmin_num_f16 v5.l, -1, -|exec_hi|, -|src_scc|
+// GFX12: v_maxmin_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b|
-// GFX12: v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_maxmin_num_f16 v5.l, 0.5, -m0, 0.5 mul:2
+// GFX12: v_maxmin_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
 
-v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc|
-// GFX12: v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_maxmin_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4
+// GFX12: v_maxmin_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
 
-v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2
-// GFX12: v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
+v_maxmin_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX12: v_maxmin_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
-v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4
-// GFX12: v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
+v_maxmin_num_f16 v5.l, v255.h, s2, s105
+// GFX12: v_maxmin_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01]
 
-v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2
-// GFX12: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+v_maxmin_num_f16 v5.l, s1, v255.h, exec_hi
+// GFX12: v_maxmin_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+
+v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX12: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_maxmin_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX12: v_maxmin_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 v_maxmin_num_f32 v5, v1, v2, s3
 // GFX12: v_maxmin_num_f32 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x69,0xd6,0x01,0x05,0x0e,0x00]
@@ -4142,50 +4178,62 @@ v_med3_u32 v5, src_scc, vcc_lo, -1
 v_med3_u32 v255, 0xaf123456, vcc_hi, null
 // GFX12: v_med3_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x21,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf]
 
-v_min3_num_f16 v5, v1, v2, s3
-// GFX12: v_min3_num_f16 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00]
+v_min3_num_f16 v5.l, v1.l, v2.l, s3
+// GFX12: v_min3_num_f16 v5.l, v1.l, v2.l, s3     ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00]
+
+v_min3_num_f16 v5.l, v255.l, s2, s105
+// GFX12: v_min3_num_f16 v5.l, v255.l, s2, s105   ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01]
+
+v_min3_num_f16 v5.l, s1, v255.l, exec_hi
+// GFX12: v_min3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01]
+
+v_min3_num_f16 v5.l, s105, s105, exec_lo
+// GFX12: v_min3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01]
 
-v_min3_num_f16 v5, v255, s2, s105
-// GFX12: v_min3_num_f16 v5, v255, s2, s105       ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01]
+v_min3_num_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX12: v_min3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_min3_num_f16 v5, s1, v255, exec_hi
-// GFX12: v_min3_num_f16 v5, s1, v255, exec_hi    ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01]
+v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX12: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_min3_num_f16 v5, s105, s105, exec_lo
-// GFX12: v_min3_num_f16 v5, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01]
+v_min3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX12: v_min3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_min3_num_f16 v5, vcc_lo, ttmp15, v3
-// GFX12: v_min3_num_f16 v5, vcc_lo, ttmp15, v3   ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04]
+v_min3_num_f16 v5.l, m0, 0.5, m0
+// GFX12: v_min3_num_f16 v5.l, m0, 0.5, m0        ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX12: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_min3_num_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX12: v_min3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01]
 
-v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX12: v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1]
+v_min3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1]
+// GFX12: v_min3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_min3_num_f16 v5, m0, 0.5, m0
-// GFX12: v_min3_num_f16 v5, m0, 0.5, m0          ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01]
+v_min3_num_f16 v5.l, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0]
+// GFX12: v_min3_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX12: v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01]
+v_min3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0]
+// GFX12: v_min3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1]
-// GFX12: v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_min3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0]
+// GFX12: v_min3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43]
 
-v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0]
-// GFX12: v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_min3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
+// GFX12: v_min3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23]
 
-v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0]
-// GFX12: v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp
+// GFX12: v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
-v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0]
-// GFX12: v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43]
+v_min3_num_f16 v5.l, v255.h, s2, s105
+// GFX12: v_min3_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x2b,0xd6,0xff,0x05,0xa4,0x01]
 
-v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
-// GFX12: v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23]
+v_min3_num_f16 v5.l, s1, v255.h, exec_hi
+// GFX12: v_min3_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0x01,0xfe,0xff,0x01]
 
-v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp
-// GFX12: v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX12: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp
+// GFX12: v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
 v_min3_num_f32 v5, v1, v2, s3
 // GFX12: v_min3_num_f32 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x29,0xd6,0x01,0x05,0x0e,0x00]
@@ -4544,50 +4592,62 @@ v_min_u16 v255.l, 0xfe0b, vcc_hi
 v_min_u16 v255.h, 0xfe0b, vcc_hi
 // GFX12: v_min_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] ; encoding: [0xff,0x40,0x0b,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_minmax_num_f16 v5, v1, v2, s3
-// GFX12: v_minmax_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
+v_minmax_num_f16 v5.l, v1.l, v2.l, s3
+// GFX12: v_minmax_num_f16 v5.l, v1.l, v2.l, s3   ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
+
+v_minmax_num_f16 v5.l, v255.l, s2, s105
+// GFX12: v_minmax_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+
+v_minmax_num_f16 v5.l, s1, v255.l, exec_hi
+// GFX12: v_minmax_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+
+v_minmax_num_f16 v5.l, s105, s105, exec_lo
+// GFX12: v_minmax_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
+
+v_minmax_num_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX12: v_minmax_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_minmax_num_f16 v5, v255, s2, s105
-// GFX12: v_minmax_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX12: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_minmax_num_f16 v5, s1, v255, exec_hi
-// GFX12: v_minmax_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+v_minmax_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX12: v_minmax_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_minmax_num_f16 v5, s105, s105, exec_lo
-// GFX12: v_minmax_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
+v_minmax_num_f16 v5.l, m0, 0.5, m0
+// GFX12: v_minmax_num_f16 v5.l, m0, 0.5, m0      ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_minmax_num_f16 v5, vcc_lo, ttmp15, v3
-// GFX12: v_minmax_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
+v_minmax_num_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX12: v_minmax_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
 
-v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX12: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_minmax_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo|
+// GFX12: v_minmax_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX12: v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
+v_minmax_num_f16 v5.l, null, exec_lo, -|0xfe0b|
+// GFX12: v_minmax_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_minmax_num_f16 v5, m0, 0.5, m0
-// GFX12: v_minmax_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
+v_minmax_num_f16 v5.l, -1, -|exec_hi|, -|src_scc|
+// GFX12: v_minmax_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX12: v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
+v_minmax_num_f16 v5.l, 0.5, -m0, 0.5 mul:2
+// GFX12: v_minmax_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
 
-v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo|
-// GFX12: v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_minmax_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4
+// GFX12: v_minmax_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
 
-v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b|
-// GFX12: v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_minmax_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX12: v_minmax_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
-v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc|
-// GFX12: v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_minmax_num_f16 v5.l, v255.h, s2, s105
+// GFX12: v_minmax_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01]
 
-v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2
-// GFX12: v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
+v_minmax_num_f16 v5.l, s1, v255.h, exec_hi
+// GFX12: v_minmax_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01]
 
-v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4
-// GFX12: v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
+v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX12: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2
-// GFX12: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+v_minmax_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX12: v_minmax_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 v_minmax_num_f32 v5, v1, v2, s3
 // GFX12: v_minmax_num_f32 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x68,0xd6,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s
index 59cb1a4..ffcf651 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s
@@ -6,11 +6,11 @@ v_min3_f32 v5, v1, v2, v3
 v_max3_f32 v5, v1, v2, v3
 // GFX12: v_max3_num_f32 v5, v1, v2, v3           ; encoding: [0x05,0x00,0x2a,0xd6,0x01,0x05,0x0e,0x04]
 
-v_min3_f16 v5, v1, v2, v3
-// GFX12: v_min3_num_f16 v5, v1, v2, v3           ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x04]
+v_min3_f16 v5.l, v1.l, v2.l, v3.l
+// GFX12: v_min3_num_f16 v5.l, v1.l, v2.l, v3.l   ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x04]
 
-v_max3_f16 v5, v1, v2, v3
-// GFX12: v_max3_num_f16 v5, v1, v2, v3           ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x04]
+v_max3_f16 v5.l, v1.l, v2.l, v3.l
+// GFX12: v_max3_num_f16 v5.l, v1.l, v2.l, v3.l   ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x04]
 
 v_med3_f32 v5, v1, v2, v3
 // GFX12: v_med3_num_f32 v5, v1, v2, v3           ; encoding: [0x05,0x00,0x31,0xd6,0x01,0x05,0x0e,0x04]
@@ -24,11 +24,11 @@ v_minmax_f32_e64_dpp v0, -v1, -v2, -v3 dpp8:[0,1,2,3,4,5,6,7]
 v_maxmin_f32_e64_dpp v0, v1, v2, v3 clamp dpp8:[0,1,2,3,4,5,6,7]
 // GFX12: v_maxmin_num_f32_e64_dpp v0, v1, v2, v3 clamp dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x00,0x80,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0xc6,0xfa]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 v_mad_i64_i32 v[5:6], s12, v1, v2, v[3:4]
 // GFX12: v_mad_co_i64_i32 v[5:6], s12, v1, v2, v[3:4] ; encoding: [0x05,0x0c,0xff,0xd6,0x01,0x05,0x0e,0x04]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
index b769324..aa804cc 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
@@ -1775,53 +1775,68 @@ v_div_fixup_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 row_xmask:0 row_mask:0x1 ban
 v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x54,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
 
-v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_fma_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
 
-v_fma_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX12: v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15
+// GFX12: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
-// GFX12: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15
-// GFX12: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15
+// GFX12: v_fma_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13]
 
-v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
+v_fma_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_fma_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
+
+v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h quad_perm:[3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h quad_perm:[0,1,2,3]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+
+v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x09,0x13]
+
+v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
 
 v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX12: v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -2480,53 +2495,53 @@ v_mad_u32_u24_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bou
 v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x0b,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_max3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+v_max3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
 
-v_max3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+v_max3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX12: v_max3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15
+// GFX12: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15
-// GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15
+// GFX12: v_max3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
-v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13]
+v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13]
 
-v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
+v_max3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_max3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
 
 v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX12: v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -2906,53 +2921,98 @@ v_max_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_
 v_max_u16_e64_dpp v255.h, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_max_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0x09,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
 
-v_maxmin_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
 
-v_maxmin_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0xfe,0xa1,0x01,0x21,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX12: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xfa,0xc1,0x01,0x2f,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x50,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15
-// GFX12: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
 
-v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
 
-v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h quad_perm:[3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h quad_perm:[0,1,2,3]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
-v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
+v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
+
+v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 
 v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -3515,53 +3575,53 @@ v_med3_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_
 v_med3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_med3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x21,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_min3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+v_min3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
 
-v_min3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+v_min3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX12: v_min3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15
+// GFX12: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15
-// GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15
+// GFX12: v_min3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
-v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13]
+v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13]
 
-v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
+v_min3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_min3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
 
 v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX12: v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -3941,53 +4001,98 @@ v_min_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_
 v_min_u16_e64_dpp v255.h, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_min_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0x0b,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
+
+v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 
-v_minmax_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0xfe,0xa1,0x01,0x21,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xfa,0xc1,0x01,0x2f,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x50,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX12: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
 
-v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
 
-v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15
-// GFX12: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h quad_perm:[3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h quad_perm:[0,1,2,3]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
-v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
 
-v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
 
-v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
+v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 
 v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -5245,20 +5350,20 @@ v_div_fixup_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0
 v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
 // GFX12: v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x54,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
-v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
-// GFX12: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
+// GFX12: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX12: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
 
-v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX12: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
+// GFX12: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
 v_mad_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
 // GFX12: v_mad_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x53,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff]
@@ -5302,20 +5407,20 @@ v_mad_u32_u16_e64_dpp v5, v1, v2, 0.5 op_sel:[1,0,0,0] row_xmask:0 row_mask:0x1
 v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
 // GFX12: v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x90,0x59,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30]
 
-v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
-// GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
+// GFX12: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
-v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
 
-v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
+// GFX12: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
 v_max3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
 // GFX12: v_max3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x4d,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff]
@@ -5392,20 +5497,20 @@ v_med3_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x
 v_med3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
 // GFX12: v_med3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0x51,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30]
 
-v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
-// GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
+// GFX12: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
-v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
 
-v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
+// GFX12: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
 v_min3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
 // GFX12: v_min3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x4a,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
index f76dd26..e93a65e 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
@@ -1008,47 +1008,62 @@ v_div_fixup_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x54,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
-v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x48,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x48,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x48,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+v_fma_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_fma_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x48,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+
+v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x13,0x48,0xd6,0xea,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x48,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 v_fma_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_fma_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -1545,47 +1560,47 @@ v_mad_u32_u24_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x0b,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2c,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2c,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0]
-// GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2c,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+v_max3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0]
+// GFX12: v_max3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2c,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 v_max3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_max3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -1863,47 +1878,86 @@ v_max_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_max_u16_e64_dpp v255.h, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_max_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0x09,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6b,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6b,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0xfe,0xa1,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xfa,0xc1,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x03,0x6b,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6b,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6b,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x13,0x6b,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x6b,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -2340,47 +2394,47 @@ v_med3_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_med3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_med3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x21,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2b,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2b,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0]
-// GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2b,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+v_min3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0]
+// GFX12: v_min3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2b,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 v_min3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_min3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x29,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -2658,47 +2712,86 @@ v_min_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_min_u16_e64_dpp v255.h, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_min_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0x0b,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6a,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
+// GFX12: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6a,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
-v_minmax_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0xfe,0xa1,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xfa,0xc1,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x03,0x6a,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6a,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x13,0x6a,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
-// GFX12: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6a,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x6a,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 v_minmax_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -3514,20 +3607,20 @@ v_div_fixup_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5
 v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
 // GFX12: v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x54,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
-v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX12: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
+// GFX12: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 v_mad_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_mad_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x53,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05]
@@ -3571,20 +3664,20 @@ v_mad_u32_u16_e64_dpp v5, v1, v2, 0.5 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0]
 v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
 // GFX12: v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x90,0x59,0xd6,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00]
 
-v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
+// GFX12: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 v_max3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_max3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x4d,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05]
@@ -3661,20 +3754,20 @@ v_med3_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0]
 v_med3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1
 // GFX12: v_med3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0x51,0xd6,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00]
 
-v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
+// GFX12: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 v_min3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_min3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x4a,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
index 015619d..d49a708 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
@@ -262,50 +262,59 @@ v_clz_i32_u32_e64 v5, src_scc
 v_clz_i32_u32_e64 v255, 0xaf123456
 // GFX12: v_clz_i32_u32_e64 v255, 0xaf123456      ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_cos_f16_e64 v5, v1
-// GFX12: v_cos_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+v_cos_f16_e64 v5.l, v1.l
+// GFX12: v_cos_f16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
 
-v_cos_f16_e64 v5, v255
-// GFX12: v_cos_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
+v_cos_f16_e64 v5.l, v255.l
+// GFX12: v_cos_f16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
 
-v_cos_f16_e64 v5, s1
-// GFX12: v_cos_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, s1
+// GFX12: v_cos_f16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, s105
-// GFX12: v_cos_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, s105
+// GFX12: v_cos_f16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, vcc_lo
-// GFX12: v_cos_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, vcc_lo
+// GFX12: v_cos_f16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, vcc_hi
-// GFX12: v_cos_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, vcc_hi
+// GFX12: v_cos_f16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, ttmp15
-// GFX12: v_cos_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, ttmp15
+// GFX12: v_cos_f16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, m0
-// GFX12: v_cos_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, m0
+// GFX12: v_cos_f16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, exec_lo
-// GFX12: v_cos_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, exec_lo
+// GFX12: v_cos_f16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, exec_hi
-// GFX12: v_cos_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, exec_hi
+// GFX12: v_cos_f16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, null
-// GFX12: v_cos_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, null
+// GFX12: v_cos_f16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, -1
-// GFX12: v_cos_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, -1
+// GFX12: v_cos_f16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, 0.5 mul:2
-// GFX12: v_cos_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
+v_cos_f16_e64 v5.l, 0.5 mul:2
+// GFX12: v_cos_f16_e64 v5.l, 0.5 mul:2           ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
 
-v_cos_f16_e64 v5, src_scc mul:4
-// GFX12: v_cos_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
+v_cos_f16_e64 v5.l, src_scc mul:4
+// GFX12: v_cos_f16_e64 v5.l, src_scc mul:4       ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
 
-v_cos_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX12: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX12: v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+v_cos_f16_e64 v5.h, v1.h
+// GFX12: v_cos_f16_e64 v5.h, v1.h op_sel:[1,1]   ; encoding: [0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00]
+
+v_cos_f16_e64 v5.l, v255.h
+// GFX12: v_cos_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00]
+
+v_cos_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX12: v_cos_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_cos_f32_e64 v5, v1
 // GFX12: v_cos_f32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00]
@@ -1417,11 +1426,11 @@ v_cvt_i32_f64_e64 v5, -|src_scc|
 v_cvt_i32_f64_e64 v255, 0xaf123456 clamp
 // GFX12: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_cvt_i32_i16_e64 v5, v1
-// GFX12: v_cvt_i32_i16_e64 v5, v1                ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00]
+v_cvt_i32_i16_e64 v5, v1.l
+// GFX12: v_cvt_i32_i16_e64 v5, v1.l              ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00]
 
-v_cvt_i32_i16_e64 v5, v255
-// GFX12: v_cvt_i32_i16_e64 v5, v255              ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
+v_cvt_i32_i16_e64 v5, v255.l
+// GFX12: v_cvt_i32_i16_e64 v5, v255.l            ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
 
 v_cvt_i32_i16_e64 v5, s1
 // GFX12: v_cvt_i32_i16_e64 v5, s1                ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00]
@@ -1462,6 +1471,9 @@ v_cvt_i32_i16_e64 v5, src_scc
 v_cvt_i32_i16_e64 v255, 0xfe0b
 // GFX12: v_cvt_i32_i16_e64 v255, 0xfe0b          ; encoding: [0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
+v_cvt_i32_i16_e64 v5, v255.h
+// GFX12: v_cvt_i32_i16_e64 v5, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00]
+
 v_cvt_nearest_i32_f32_e64 v5, v1
 // GFX12: v_cvt_nearest_i32_f32_e64 v5, v1        ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00]
 
@@ -1840,11 +1852,11 @@ v_cvt_u32_f64_e64 v5, -|src_scc|
 v_cvt_u32_f64_e64 v255, 0xaf123456 clamp
 // GFX12: v_cvt_u32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x95,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_cvt_u32_u16_e64 v5, v1
-// GFX12: v_cvt_u32_u16_e64 v5, v1                ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00]
+v_cvt_u32_u16_e64 v5, v1.l
+// GFX12: v_cvt_u32_u16_e64 v5, v1.l              ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00]
 
-v_cvt_u32_u16_e64 v5, v255
-// GFX12: v_cvt_u32_u16_e64 v5, v255              ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
+v_cvt_u32_u16_e64 v5, v255.l
+// GFX12: v_cvt_u32_u16_e64 v5, v255.l            ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
 
 v_cvt_u32_u16_e64 v5, s1
 // GFX12: v_cvt_u32_u16_e64 v5, s1                ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x00,0x00,0x00]
@@ -1885,6 +1897,9 @@ v_cvt_u32_u16_e64 v5, src_scc
 v_cvt_u32_u16_e64 v255, 0xfe0b
 // GFX12: v_cvt_u32_u16_e64 v255, 0xfe0b          ; encoding: [0xff,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
+v_cvt_u32_u16_e64 v5, v255.h
+// GFX12: v_cvt_u32_u16_e64 v5, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xeb,0xd5,0xff,0x01,0x00,0x00]
+
 v_exp_f16_e64 v5, v1
 // GFX12: v_exp_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00]
 
@@ -2236,50 +2251,59 @@ v_floor_f64_e64 v[5:6], -|src_scc| mul:4
 v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2
 // GFX12: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
-v_fract_f16_e64 v5, v1
-// GFX12: v_fract_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+v_fract_f16_e64 v5.l, v1.l
+// GFX12: v_fract_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+
+v_fract_f16_e64 v5.l, v255.l
+// GFX12: v_fract_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
 
-v_fract_f16_e64 v5, v255
-// GFX12: v_fract_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
+v_fract_f16_e64 v5.l, s1
+// GFX12: v_fract_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, s1
-// GFX12: v_fract_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, s105
+// GFX12: v_fract_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, s105
-// GFX12: v_fract_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, vcc_lo
+// GFX12: v_fract_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, vcc_lo
-// GFX12: v_fract_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, vcc_hi
+// GFX12: v_fract_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, vcc_hi
-// GFX12: v_fract_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, ttmp15
+// GFX12: v_fract_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, ttmp15
-// GFX12: v_fract_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, m0
+// GFX12: v_fract_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, m0
-// GFX12: v_fract_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, exec_lo
+// GFX12: v_fract_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, exec_lo
-// GFX12: v_fract_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, exec_hi
+// GFX12: v_fract_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, exec_hi
-// GFX12: v_fract_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, null
+// GFX12: v_fract_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, null
-// GFX12: v_fract_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, -1
+// GFX12: v_fract_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, -1
-// GFX12: v_fract_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, 0.5 mul:2
+// GFX12: v_fract_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08]
 
-v_fract_f16_e64 v5, 0.5 mul:2
-// GFX12: v_fract_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08]
+v_fract_f16_e64 v5.l, src_scc mul:4
+// GFX12: v_fract_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10]
 
-v_fract_f16_e64 v5, src_scc mul:4
-// GFX12: v_fract_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10]
+v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX12: v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
-v_fract_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX12: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_fract_f16_e64 v5.h, v1.h
+// GFX12: v_fract_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00]
+
+v_fract_f16_e64 v5.l, v255.h
+// GFX12: v_fract_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00]
+
+v_fract_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX12: v_fract_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_fract_f32_e64 v5, v1
 // GFX12: v_fract_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00]
@@ -2497,50 +2521,59 @@ v_frexp_exp_i32_f64_e64 v5, -|src_scc|
 v_frexp_exp_i32_f64_e64 v255, 0xaf123456
 // GFX12: v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_frexp_mant_f16_e64 v5, v1
-// GFX12: v_frexp_mant_f16_e64 v5, v1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, v1.l
+// GFX12: v_frexp_mant_f16_e64 v5.l, v1.l         ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5.l, v255.l
+// GFX12: v_frexp_mant_f16_e64 v5.l, v255.l       ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, v255
-// GFX12: v_frexp_mant_f16_e64 v5, v255           ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, s1
+// GFX12: v_frexp_mant_f16_e64 v5.l, s1           ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, s1
-// GFX12: v_frexp_mant_f16_e64 v5, s1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, s105
+// GFX12: v_frexp_mant_f16_e64 v5.l, s105         ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, s105
-// GFX12: v_frexp_mant_f16_e64 v5, s105           ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, vcc_lo
+// GFX12: v_frexp_mant_f16_e64 v5.l, vcc_lo       ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, vcc_lo
-// GFX12: v_frexp_mant_f16_e64 v5, vcc_lo         ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, vcc_hi
+// GFX12: v_frexp_mant_f16_e64 v5.l, vcc_hi       ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, vcc_hi
-// GFX12: v_frexp_mant_f16_e64 v5, vcc_hi         ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, ttmp15
+// GFX12: v_frexp_mant_f16_e64 v5.l, ttmp15       ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, ttmp15
-// GFX12: v_frexp_mant_f16_e64 v5, ttmp15         ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, m0
+// GFX12: v_frexp_mant_f16_e64 v5.l, m0           ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, m0
-// GFX12: v_frexp_mant_f16_e64 v5, m0             ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, exec_lo
+// GFX12: v_frexp_mant_f16_e64 v5.l, exec_lo      ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, exec_lo
-// GFX12: v_frexp_mant_f16_e64 v5, exec_lo        ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, exec_hi
+// GFX12: v_frexp_mant_f16_e64 v5.l, exec_hi      ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, exec_hi
-// GFX12: v_frexp_mant_f16_e64 v5, exec_hi        ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, null
+// GFX12: v_frexp_mant_f16_e64 v5.l, null         ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, null
-// GFX12: v_frexp_mant_f16_e64 v5, null           ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, -1
+// GFX12: v_frexp_mant_f16_e64 v5.l, -1           ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, -1
-// GFX12: v_frexp_mant_f16_e64 v5, -1             ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, 0.5 mul:2
+// GFX12: v_frexp_mant_f16_e64 v5.l, 0.5 mul:2    ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08]
 
-v_frexp_mant_f16_e64 v5, 0.5 mul:2
-// GFX12: v_frexp_mant_f16_e64 v5, 0.5 mul:2      ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08]
+v_frexp_mant_f16_e64 v5.l, src_scc mul:4
+// GFX12: v_frexp_mant_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10]
 
-v_frexp_mant_f16_e64 v5, src_scc mul:4
-// GFX12: v_frexp_mant_f16_e64 v5, src_scc mul:4  ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10]
+v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX12: v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
-v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX12: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_frexp_mant_f16_e64 v5.h, v1.h
+// GFX12: v_frexp_mant_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5.l, v255.h
+// GFX12: v_frexp_mant_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00]
+
+v_frexp_mant_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX12: v_frexp_mant_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_frexp_mant_f32_e64 v5, v1
 // GFX12: v_frexp_mant_f32_e64 v5, v1             ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00]
@@ -2824,50 +2857,59 @@ v_movrelsd_b32_e64 v255, v255
 v_nop_e64
 // GFX12: v_nop                                   ; encoding: [0x00,0x00,0x80,0xd5,0x00,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, v1
-// GFX12: v_not_b16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+v_not_b16_e64 v5.l, v1.l
+// GFX12: v_not_b16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+
+v_not_b16_e64 v5.l, v255.l
+// GFX12: v_not_b16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
 
-v_not_b16_e64 v5, v255
-// GFX12: v_not_b16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
+v_not_b16_e64 v5.l, s1
+// GFX12: v_not_b16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, s1
-// GFX12: v_not_b16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, s105
+// GFX12: v_not_b16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, s105
-// GFX12: v_not_b16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, vcc_lo
+// GFX12: v_not_b16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, vcc_lo
-// GFX12: v_not_b16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, vcc_hi
+// GFX12: v_not_b16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, vcc_hi
-// GFX12: v_not_b16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, ttmp15
+// GFX12: v_not_b16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, ttmp15
-// GFX12: v_not_b16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, m0
+// GFX12: v_not_b16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, m0
-// GFX12: v_not_b16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, exec_lo
+// GFX12: v_not_b16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, exec_lo
-// GFX12: v_not_b16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, exec_hi
+// GFX12: v_not_b16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, exec_hi
-// GFX12: v_not_b16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, null
+// GFX12: v_not_b16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, null
-// GFX12: v_not_b16_e64 v5, null                  ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, -1
+// GFX12: v_not_b16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, -1
-// GFX12: v_not_b16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, 0.5
+// GFX12: v_not_b16_e64 v5.l, 0.5                 ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, 0.5
-// GFX12: v_not_b16_e64 v5, 0.5                   ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, src_scc
+// GFX12: v_not_b16_e64 v5.l, src_scc             ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, src_scc
-// GFX12: v_not_b16_e64 v5, src_scc               ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
+v_not_b16_e64 v255.l, 0xfe0b
+// GFX12: v_not_b16_e64 v255.l, 0xfe0b            ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_not_b16_e64 v255, 0xfe0b
-// GFX12: v_not_b16_e64 v255, 0xfe0b              ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_not_b16_e64 v5.h, v1.h
+// GFX12: v_not_b16_e64 v5.h, v1.h op_sel:[1,1]   ; encoding: [0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00]
+
+v_not_b16_e64 v5.l, v255.h
+// GFX12: v_not_b16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00]
+
+v_not_b16_e64 v255.h, 0xfe0b
+// GFX12: v_not_b16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 v_not_b32_e64 v5, v1
 // GFX12: v_not_b32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00]
@@ -3088,50 +3130,59 @@ v_rcp_iflag_f32_e64 v5, src_scc mul:4
 v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2
 // GFX12: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf]
 
-v_rndne_f16_e64 v5, v1
-// GFX12: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+v_rndne_f16_e64 v5.l, v1.l
+// GFX12: v_rndne_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v5.l, v255.l
+// GFX12: v_rndne_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
 
-v_rndne_f16_e64 v5, v255
-// GFX12: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+v_rndne_f16_e64 v5.l, s1
+// GFX12: v_rndne_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, s1
-// GFX12: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, s105
+// GFX12: v_rndne_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, s105
-// GFX12: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, vcc_lo
+// GFX12: v_rndne_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, vcc_lo
-// GFX12: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, vcc_hi
+// GFX12: v_rndne_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, vcc_hi
-// GFX12: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, ttmp15
+// GFX12: v_rndne_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, ttmp15
-// GFX12: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, m0
+// GFX12: v_rndne_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, m0
-// GFX12: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, exec_lo
+// GFX12: v_rndne_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, exec_lo
-// GFX12: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, exec_hi
+// GFX12: v_rndne_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, exec_hi
-// GFX12: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, null
+// GFX12: v_rndne_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, null
-// GFX12: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, -1
+// GFX12: v_rndne_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, -1
-// GFX12: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, 0.5 mul:2
+// GFX12: v_rndne_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
 
-v_rndne_f16_e64 v5, 0.5 mul:2
-// GFX12: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+v_rndne_f16_e64 v5.l, src_scc mul:4
+// GFX12: v_rndne_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
 
-v_rndne_f16_e64 v5, src_scc mul:4
-// GFX12: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX12: v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
-v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX12: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_rndne_f16_e64 v5.h, v1.h
+// GFX12: v_rndne_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v5.l, v255.h
+// GFX12: v_rndne_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX12: v_rndne_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_rndne_f32_e64 v5, v1
 // GFX12: v_rndne_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00]
@@ -3385,50 +3436,62 @@ v_sat_pk_u8_i16_e64 v5, src_scc
 v_sat_pk_u8_i16_e64 v255, 0xfe0b
 // GFX12: v_sat_pk_u8_i16_e64 v255, 0xfe0b        ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_sin_f16_e64 v5, v1
-// GFX12: v_sin_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+v_sat_pk_u8_i16_e64 v255.h, 0xfe0b
+// GFX12: v_sat_pk_u8_i16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_sin_f16_e64 v5.l, v1.l
+// GFX12: v_sin_f16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
 
-v_sin_f16_e64 v5, v255
-// GFX12: v_sin_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
+v_sin_f16_e64 v5.l, v255.l
+// GFX12: v_sin_f16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
 
-v_sin_f16_e64 v5, s1
-// GFX12: v_sin_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, s1
+// GFX12: v_sin_f16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, s105
-// GFX12: v_sin_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, s105
+// GFX12: v_sin_f16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, vcc_lo
-// GFX12: v_sin_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, vcc_lo
+// GFX12: v_sin_f16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, vcc_hi
-// GFX12: v_sin_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, vcc_hi
+// GFX12: v_sin_f16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, ttmp15
-// GFX12: v_sin_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, ttmp15
+// GFX12: v_sin_f16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, m0
-// GFX12: v_sin_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, m0
+// GFX12: v_sin_f16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, exec_lo
-// GFX12: v_sin_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, exec_lo
+// GFX12: v_sin_f16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, exec_hi
-// GFX12: v_sin_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, exec_hi
+// GFX12: v_sin_f16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, null
-// GFX12: v_sin_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, null
+// GFX12: v_sin_f16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, -1
-// GFX12: v_sin_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, -1
+// GFX12: v_sin_f16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, 0.5 mul:2
-// GFX12: v_sin_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08]
+v_sin_f16_e64 v5.l, 0.5 mul:2
+// GFX12: v_sin_f16_e64 v5.l, 0.5 mul:2           ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08]
 
-v_sin_f16_e64 v5, src_scc mul:4
-// GFX12: v_sin_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
+v_sin_f16_e64 v5.l, src_scc mul:4
+// GFX12: v_sin_f16_e64 v5.l, src_scc mul:4       ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
 
-v_sin_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX12: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX12: v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+v_sin_f16_e64 v5.h, v1.h
+// GFX12: v_sin_f16_e64 v5.h, v1.h op_sel:[1,1]   ; encoding: [0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00]
+
+v_sin_f16_e64 v5.l, v255.h
+// GFX12: v_sin_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00]
+
+v_sin_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX12: v_sin_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_sin_f32_e64 v5, v1
 // GFX12: v_sin_f32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00]
@@ -3601,50 +3664,59 @@ v_sqrt_f64_e64 v[5:6], -|src_scc| mul:4
 v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2
 // GFX12: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
-v_trunc_f16_e64 v5, v1
-// GFX12: v_trunc_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+v_trunc_f16_e64 v5.l, v1.l
+// GFX12: v_trunc_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+
+v_trunc_f16_e64 v5.l, v255.l
+// GFX12: v_trunc_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
+
+v_trunc_f16_e64 v5.l, s1
+// GFX12: v_trunc_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v5.l, s105
+// GFX12: v_trunc_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, v255
-// GFX12: v_trunc_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
+v_trunc_f16_e64 v5.l, vcc_lo
+// GFX12: v_trunc_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, s1
-// GFX12: v_trunc_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, vcc_hi
+// GFX12: v_trunc_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, s105
-// GFX12: v_trunc_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, ttmp15
+// GFX12: v_trunc_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, vcc_lo
-// GFX12: v_trunc_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, m0
+// GFX12: v_trunc_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, vcc_hi
-// GFX12: v_trunc_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, exec_lo
+// GFX12: v_trunc_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, ttmp15
-// GFX12: v_trunc_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, exec_hi
+// GFX12: v_trunc_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, m0
-// GFX12: v_trunc_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, null
+// GFX12: v_trunc_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, exec_lo
-// GFX12: v_trunc_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, -1
+// GFX12: v_trunc_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, exec_hi
-// GFX12: v_trunc_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, 0.5 mul:2
+// GFX12: v_trunc_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08]
 
-v_trunc_f16_e64 v5, null
-// GFX12: v_trunc_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, src_scc mul:4
+// GFX12: v_trunc_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10]
 
-v_trunc_f16_e64 v5, -1
-// GFX12: v_trunc_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX12: v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
-v_trunc_f16_e64 v5, 0.5 mul:2
-// GFX12: v_trunc_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08]
+v_trunc_f16_e64 v5.h, v1.h
+// GFX12: v_trunc_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00]
 
-v_trunc_f16_e64 v5, src_scc mul:4
-// GFX12: v_trunc_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10]
+v_trunc_f16_e64 v5.l, v255.h
+// GFX12: v_trunc_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00]
 
-v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX12: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_trunc_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX12: v_trunc_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_trunc_f32_e64 v5, v1
 // GFX12: v_trunc_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
index 160bc3f..89102ae 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
@@ -211,47 +211,56 @@ v_clz_i32_u32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_mirror
-// GFX12: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_half_mirror
-// GFX12: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_shl:1
-// GFX12: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_shl:15
-// GFX12: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_shr:1
-// GFX12: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_shr:15
-// GFX12: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_ror:1
-// GFX12: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_ror:15
-// GFX12: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+
+v_cos_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_cos_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_cos_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1000,47 +1009,50 @@ v_cvt_i32_f32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x05,0x30]
 
-v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_mirror
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
 
-v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
+v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
+
+v_cvt_i32_i16_e64_dpp v255, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
 v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1363,47 +1375,50 @@ v_cvt_u32_f32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_cvt_u32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_cvt_u32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x87,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x05,0x30]
 
-v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_mirror
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_mirror
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_half_mirror
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_half_mirror
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:1
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_shl:1
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:15
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_shl:15
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:1
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_shr:1
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:15
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_shr:15
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:1
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_ror:1
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:15
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_ror:15
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
+v_cvt_u32_u16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cvt_u32_u16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
+v_cvt_u32_u16_e64_dpp v255, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cvt_u32_u16_e64_dpp v255, v255.h op_sel:[1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x08,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
 v_exp_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_exp_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1699,47 +1714,56 @@ v_floor_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct
 v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_mirror
-// GFX12: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_half_mirror
-// GFX12: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_shl:1
-// GFX12: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_shl:15
-// GFX12: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_shr:1
-// GFX12: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_shr:15
-// GFX12: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_ror:1
-// GFX12: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_ror:15
-// GFX12: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_fract_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_fract_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_fract_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1876,47 +1900,56 @@ v_frexp_exp_i32_f32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_
 v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x05,0x30]
 
-v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_mirror
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_frexp_mant_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_frexp_mant_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2128,47 +2161,56 @@ v_movrels_b32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_movrels_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_movrels_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xc3,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_mirror
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_mirror
-// GFX12: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_half_mirror
-// GFX12: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_shl:1
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_shl:1
-// GFX12: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_shl:15
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_shl:15
-// GFX12: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_shr:1
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_shr:1
-// GFX12: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_shr:15
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_shr:15
-// GFX12: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_ror:1
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_ror:1
-// GFX12: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_ror:15
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_ror:15
-// GFX12: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
-v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
 
-v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
+v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
+v_not_b16_e64_dpp v5.h, v1.h row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+
+v_not_b16_e64_dpp v5.l, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
+
+v_not_b16_e64_dpp v255.h, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
 v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2338,47 +2380,56 @@ v_rcp_iflag_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 boun
 v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_mirror
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_half_mirror
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shl:1
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shl:15
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shr:1
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shr:15
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_ror:1
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_ror:15
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_rndne_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_rndne_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_rndne_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2548,47 +2599,59 @@ v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl
 v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_sat_pk_u8_i16_e64_dpp v255.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
+
+v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_mirror
-// GFX12: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_half_mirror
-// GFX12: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_shl:1
-// GFX12: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_shl:15
-// GFX12: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_shr:1
-// GFX12: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_shr:15
-// GFX12: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_ror:1
-// GFX12: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_ror:15
-// GFX12: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+
+v_sin_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_sin_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_sin_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2716,47 +2779,56 @@ v_sqrt_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr
 v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+
+v_trunc_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+
+v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_mirror
-// GFX12: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_half_mirror
-// GFX12: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_shl:1
-// GFX12: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_shl:15
-// GFX12: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_shr:1
-// GFX12: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_shr:15
-// GFX12: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_ror:1
-// GFX12: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_trunc_f16_e64_dpp v5, v1 row_ror:15
-// GFX12: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_trunc_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_trunc_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_trunc_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
index c9ea7cd..1b1a91f 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
@@ -52,17 +52,26 @@ v_clz_i32_u32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xb9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+v_cos_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_cos_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_cos_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -316,14 +325,17 @@ v_cvt_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x88,0xd5,0xe9,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
-v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+v_cvt_i32_i16_e64_dpp v255, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x08,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -418,14 +430,17 @@ v_cvt_u32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cvt_u32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_cvt_u32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x87,0xd5,0xe9,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
-v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cvt_u32_u16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cvt_u32_u16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cvt_u32_u16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cvt_u32_u16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+v_cvt_u32_u16_e64_dpp v255, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cvt_u32_u16_e64_dpp v255, v255.h op_sel:[1,0] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x08,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 v_exp_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_exp_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -502,17 +517,26 @@ v_floor_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xa4,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_fract_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_fract_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_fract_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -553,17 +577,26 @@ v_frexp_exp_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0xbf,0xd5,0xe9,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_frexp_mant_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -619,14 +652,23 @@ v_movrels_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_movrels_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_movrels_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xc3,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+v_not_b16_e64_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_not_b16_e64_dpp v5.l, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_not_b16_e64_dpp v255.h, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -673,17 +715,26 @@ v_rcp_iflag_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xab,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_rndne_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_rndne_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_rndne_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -730,17 +781,29 @@ v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_sat_pk_u8_i16_e64_dpp v255.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+v_sin_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_sin_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_sin_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -778,17 +841,26 @@ v_sqrt_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xb3,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_trunc_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_trunc_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_trunc_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AsmParser/macro-def-in-instantiation.s b/llvm/test/MC/AsmParser/macro-def-in-instantiation.s
index b6df62e..96ddde82 100644
--- a/llvm/test/MC/AsmParser/macro-def-in-instantiation.s
+++ b/llvm/test/MC/AsmParser/macro-def-in-instantiation.s
@@ -9,22 +9,3 @@
 make_macro .macro,mybyte,a,.byte,\a,.endm
 # CHECK: .byte 42
 mybyte 42
-
-# PR18599
-.macro macro_a
- .macro macro_b
-  .byte 10
-  .macro macro_c
-  .endm
-
-  macro_c
-  .purgem macro_c
- .endm
-
- macro_b
-.endm
-
-# CHECK: .byte 10
-# CHECK: .byte 10
-macro_a
-macro_b
diff --git a/llvm/test/MC/AsmParser/macro-nesting.s b/llvm/test/MC/AsmParser/macro-nesting.s
new file mode 100644
index 0000000..37561a6
--- /dev/null
+++ b/llvm/test/MC/AsmParser/macro-nesting.s
@@ -0,0 +1,45 @@
+# RUN: llvm-mc -triple=x86_64 %s | FileCheck %s
+
+# CHECK:              .globl  inside_at_0
+# CHECK-NEXT:         .globl  inside_plus_0
+# CHECK-NEXT: inside_at_0:
+# CHECK-NEXT: inside_plus_0:
+# CHECK-NEXT:         .globl  after_at_0
+# CHECK-NEXT:         .globl  after_plus_0
+# CHECK-NEXT: after_at_0:
+# CHECK-NEXT: after_plus_0:
+
+.macro outer1
+  .macro inner1
+    .globl inside_at_\@
+    .globl inside_plus_\+
+    inside_at_\@:
+    inside_plus_\+:
+  .endm
+  inner1
+  .globl after_at_\@
+  .globl after_plus_\+
+  after_at_\@:
+  after_plus_\+:
+.endm
+
+outer1
+
+# PR18599
+.macro macro_a
+ .macro macro_b
+  .byte 10
+  .macro macro_c
+  .endm
+
+  macro_c
+  .purgem macro_c
+ .endm
+
+ macro_b
+.endm
+
+# CHECK: .byte 10
+# CHECK: .byte 10
+macro_a
+macro_b
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.6a-rme-gpc3.txt b/llvm/test/MC/Disassembler/AArch64/armv9.6a-rme-gpc3.txt
index c5d074b..d198771 100644
--- a/llvm/test/MC/Disassembler/AArch64/armv9.6a-rme-gpc3.txt
+++ b/llvm/test/MC/Disassembler/AArch64/armv9.6a-rme-gpc3.txt
@@ -1,10 +1,18 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
 # RUN: llvm-mc -triple aarch64 -disassemble %s  | FileCheck %s
 
-[0x1f,0x70,0x0e,0xd5]
+[0x00,0x70,0x0e,0xd5]
+[0x01,0x70,0x0e,0xd5]
+[0x02,0x70,0x0e,0xd5]
+[0x11,0x70,0x0e,0xd5]
+[0x1e,0x70,0x0e,0xd5]
 [0xa3,0x21,0x3e,0xd5]
 [0xa4,0x21,0x1e,0xd5]
 
-# CHECK:      	sys	#6, c7, c0, #0
+# CHECK:      	sys	#6, c7, c0, #0, x0
+# CHECK-NEXT: 	sys	#6, c7, c0, #0, x1
+# CHECK-NEXT: 	sys	#6, c7, c0, #0, x2
+# CHECK-NEXT: 	sys	#6, c7, c0, #0, x17
+# CHECK-NEXT: 	sys	#6, c7, c0, #0, x30
 # CHECK-NEXT: 	mrs	x3, GPCBW_EL3
 # CHECK-NEXT: 	msr	GPCBW_EL3, x4
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_smem.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_smem.txt
index 890a64b..9530167 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_smem.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_smem.txt
@@ -1261,3 +1261,18 @@
 
 # GFX10: s_store_dwordx4 s[96:99], s[4:5], s0    ; encoding: [0x02,0x18,0x48,0xf4,0x00,0x00,0x00,0x00]
 0x02,0x18,0x48,0xf4,0x00,0x00,0x00,0x00
+
+# GFX10: s_load_dword null, s[2:3], s0           ; encoding: [0x41,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00]
+0x41,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00
+
+# GFX10: s_load_dwordx2 null, s[2:3], s0         ; encoding: [0x41,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00]
+0x41,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00
+
+# GFX10: s_load_dwordx4 null, s[2:3], s0         ; encoding: [0x41,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00]
+0x41,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00
+
+# GFX10: s_load_dwordx8 null, s[2:3], s0         ; encoding: [0x41,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00]
+0x41,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00
+
+# GFX10: s_load_dwordx16 null, s[2:3], s0        ; encoding: [0x41,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00]
+0x41,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_smem.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_smem.txt
index 8b49de5..8396132 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_smem.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_smem.txt
@@ -471,3 +471,18 @@
 
 # GFX11: s_gl1_inv                               ; encoding: [0x00,0x00,0x80,0xf4,0x00,0x00,0x00,0x00]
 0x00,0x00,0x80,0xf4,0x00,0x00,0x00,0x00
+
+# GFX11: s_load_b32 null, s[2:3], s0             ; encoding: [0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00]
+0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00
+
+# GFX11: s_load_b64 null, s[2:3], s0             ; encoding: [0x01,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00]
+0x01,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00
+
+# GFX11: s_load_b128 null, s[2:3], s0            ; encoding: [0x01,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00]
+0x01,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00
+
+# GFX11: s_load_b256 null, s[2:3], s0            ; encoding: [0x01,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00]
+0x01,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00
+
+# GFX11: s_load_b512 null, s[2:3], s0            ; encoding: [0x01,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00]
+0x01,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
index 61e529a..57a1da6 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
@@ -287,49 +287,82 @@
 # GFX11: v_clz_i32_u32_e32 v255, 0xaf123456      ; encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 0x01,0xc3,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, v1                    ; encoding: [0x01,0xc3,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, v1.l                ; encoding: [0x01,0xc3,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, v1                    ; encoding: [0x01,0xc3,0x0a,0x7e]
 
 0x7f,0xc3,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, v127                  ; encoding: [0x7f,0xc3,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, v127.l              ; encoding: [0x7f,0xc3,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, v127                  ; encoding: [0x7f,0xc3,0x0a,0x7e]
 
 0x01,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, s1                    ; encoding: [0x01,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, s1                  ; encoding: [0x01,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, s1                    ; encoding: [0x01,0xc2,0x0a,0x7e]
 
 0x69,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, s105                  ; encoding: [0x69,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, s105                ; encoding: [0x69,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, s105                  ; encoding: [0x69,0xc2,0x0a,0x7e]
 
 0x6a,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, vcc_lo              ; encoding: [0x6a,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xc2,0x0a,0x7e]
 
 0x6b,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, vcc_hi              ; encoding: [0x6b,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xc2,0x0a,0x7e]
 
 0x7b,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, ttmp15              ; encoding: [0x7b,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xc2,0x0a,0x7e]
 
 0x7d,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, m0                    ; encoding: [0x7d,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, m0                  ; encoding: [0x7d,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, m0                    ; encoding: [0x7d,0xc2,0x0a,0x7e]
 
 0x7e,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, exec_lo             ; encoding: [0x7e,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xc2,0x0a,0x7e]
 
 0x7f,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, exec_hi             ; encoding: [0x7f,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xc2,0x0a,0x7e]
 
 0x7c,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, null                  ; encoding: [0x7c,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, null                ; encoding: [0x7c,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, null                  ; encoding: [0x7c,0xc2,0x0a,0x7e]
 
 0xc1,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, -1                    ; encoding: [0xc1,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, -1                  ; encoding: [0xc1,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, -1                    ; encoding: [0xc1,0xc2,0x0a,0x7e]
 
 0xf0,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, 0.5                 ; encoding: [0xf0,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xc2,0x0a,0x7e]
 
 0xfd,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, src_scc               ; encoding: [0xfd,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, src_scc             ; encoding: [0xfd,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, src_scc               ; encoding: [0xfd,0xc2,0x0a,0x7e]
 
 0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-# GFX11: v_cos_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e32 v127.l, 0xfe0b            ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+0x81,0xc3,0x0a,0x7e
+# GFX11-REAL16: v_cos_f16_e32 v5.l, v1.h                ; encoding: [0x81,0xc3,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xc3,0x0a,0x7e]
+
+0xff,0xc3,0x0a,0x7e
+# GFX11-REAL16: v_cos_f16_e32 v5.l, v127.h              ; encoding: [0xff,0xc3,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xc3,0x0a,0x7e]
+
+0xf0,0xc2,0xfe,0x7e
+# GFX11-REAL16: v_cos_f16_e32 v127.l, 0.5               ; encoding: [0xf0,0xc2,0xfe,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v127, 0.5                 ; encoding: [0xf0,0xc2,0xfe,0x7e]
+
+0xfd,0xc2,0x0a,0x7f
+# GFX11-REAL16: v_cos_f16_e32 v5.h, src_scc             ; encoding: [0xfd,0xc2,0x0a,0x7f]
+
+0xff,0xc2,0xfe,0x7f,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_cos_f16_e32 v127.h, 0xfe0b            ; encoding: [0xff,0xc2,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 0x01,0x6d,0x0a,0x7e
 # GFX11: v_cos_f32_e32 v5, v1                    ; encoding: [0x01,0x6d,0x0a,0x7e]
@@ -1300,10 +1333,12 @@
 # GFX11: v_cvt_i32_f64_e32 v255, 0xaf123456      ; encoding: [0xff,0x06,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 0x01,0xd5,0x0a,0x7e
-# GFX11: v_cvt_i32_i16_e32 v5, v1                ; encoding: [0x01,0xd5,0x0a,0x7e]
+# GFX11-REAL16: v_cvt_i32_i16_e32 v5, v1.l              ; encoding: [0x01,0xd5,0x0a,0x7e]
+# GFX11-FAKE16: v_cvt_i32_i16_e32 v5, v1                ; encoding: [0x01,0xd5,0x0a,0x7e]
 
 0x7f,0xd5,0x0a,0x7e
-# GFX11: v_cvt_i32_i16_e32 v5, v127              ; encoding: [0x7f,0xd5,0x0a,0x7e]
+# GFX11-REAL16: v_cvt_i32_i16_e32 v5, v127.l            ; encoding: [0x7f,0xd5,0x0a,0x7e]
+# GFX11-FAKE16: v_cvt_i32_i16_e32 v5, v127              ; encoding: [0x7f,0xd5,0x0a,0x7e]
 
 0x01,0xd4,0x0a,0x7e
 # GFX11: v_cvt_i32_i16_e32 v5, s1                ; encoding: [0x01,0xd4,0x0a,0x7e]
@@ -1344,6 +1379,15 @@
 0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00
 # GFX11: v_cvt_i32_i16_e32 v255, 0xfe0b          ; encoding: [0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
+0x81,0xd5,0x0a,0x7e
+# GFX11-REAL16: v_cvt_i32_i16_e32 v5, v1.h              ; encoding: [0x81,0xd5,0x0a,0x7e]
+# GFX11-FAKE16: v_cvt_i32_i16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xd5,0x0a,0x7e]
+
+0xff,0xd5,0x0a,0x7e
+# GFX11-REAL16: v_cvt_i32_i16_e32 v5, v127.h            ; encoding: [0xff,0xd5,0x0a,0x7e]
+# GFX11-FAKE16: v_cvt_i32_i16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xd5,0x0a,0x7e]
+
+
 0x01,0x19,0x0a,0x7e
 # GFX11: v_cvt_nearest_i32_f32_e32 v5, v1        ; encoding: [0x01,0x19,0x0a,0x7e]
 
@@ -1714,10 +1758,12 @@
 # GFX11: v_cvt_u32_f64_e32 v255, 0xaf123456      ; encoding: [0xff,0x2a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 0x01,0xd7,0x0a,0x7e
-# GFX11: v_cvt_u32_u16_e32 v5, v1                ; encoding: [0x01,0xd7,0x0a,0x7e]
+# GFX11-REAL16: v_cvt_u32_u16_e32 v5, v1.l              ; encoding: [0x01,0xd7,0x0a,0x7e]
+# GFX11-FAKE16: v_cvt_u32_u16_e32 v5, v1                ; encoding: [0x01,0xd7,0x0a,0x7e]
 
 0x7f,0xd7,0x0a,0x7e
-# GFX11: v_cvt_u32_u16_e32 v5, v127              ; encoding: [0x7f,0xd7,0x0a,0x7e]
+# GFX11-REAL16: v_cvt_u32_u16_e32 v5, v127.l            ; encoding: [0x7f,0xd7,0x0a,0x7e]
+# GFX11-FAKE16: v_cvt_u32_u16_e32 v5, v127              ; encoding: [0x7f,0xd7,0x0a,0x7e]
 
 0x01,0xd6,0x0a,0x7e
 # GFX11: v_cvt_u32_u16_e32 v5, s1                ; encoding: [0x01,0xd6,0x0a,0x7e]
@@ -1758,6 +1804,15 @@
 0xff,0xd6,0xfe,0x7f,0x0b,0xfe,0x00,0x00
 # GFX11: v_cvt_u32_u16_e32 v255, 0xfe0b          ; encoding: [0xff,0xd6,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
+0x81,0xd7,0x0a,0x7e
+# GFX11-REAL16: v_cvt_u32_u16_e32 v5, v1.h              ; encoding: [0x81,0xd7,0x0a,0x7e]
+# GFX11-FAKE16: v_cvt_u32_u16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xd7,0x0a,0x7e]
+
+0xff,0xd7,0x0a,0x7e
+# GFX11-REAL16: v_cvt_u32_u16_e32 v5, v127.h            ; encoding: [0xff,0xd7,0x0a,0x7e]
+# GFX11-FAKE16: v_cvt_u32_u16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xd7,0x0a,0x7e]
+
+
 0x01,0xb1,0x0a,0x7e
 # GFX11-REAL16: v_exp_f16_e32 v5.l, v1.l                ; encoding: [0x01,0xb1,0x0a,0x7e]
 # GFX11-FAKE16: v_exp_f16_e32 v5, v1                    ; encoding: [0x01,0xb1,0x0a,0x7e]
@@ -2017,49 +2072,82 @@
 # GFX11: v_floor_f64_e32 v[254:255], 0xaf123456  ; encoding: [0xff,0x34,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
 0x01,0xbf,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, v1                  ; encoding: [0x01,0xbf,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, v1.l              ; encoding: [0x01,0xbf,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, v1                  ; encoding: [0x01,0xbf,0x0a,0x7e]
 
 0x7f,0xbf,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, v127                ; encoding: [0x7f,0xbf,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, v127.l            ; encoding: [0x7f,0xbf,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, v127                ; encoding: [0x7f,0xbf,0x0a,0x7e]
 
 0x01,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, s1                  ; encoding: [0x01,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, s1                ; encoding: [0x01,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, s1                  ; encoding: [0x01,0xbe,0x0a,0x7e]
 
 0x69,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, s105                ; encoding: [0x69,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, s105              ; encoding: [0x69,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, s105                ; encoding: [0x69,0xbe,0x0a,0x7e]
 
 0x6a,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, vcc_lo            ; encoding: [0x6a,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbe,0x0a,0x7e]
 
 0x6b,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, vcc_hi            ; encoding: [0x6b,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbe,0x0a,0x7e]
 
 0x7b,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, ttmp15            ; encoding: [0x7b,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbe,0x0a,0x7e]
 
 0x7d,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, m0                  ; encoding: [0x7d,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, m0                ; encoding: [0x7d,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, m0                  ; encoding: [0x7d,0xbe,0x0a,0x7e]
 
 0x7e,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, exec_lo           ; encoding: [0x7e,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbe,0x0a,0x7e]
 
 0x7f,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, exec_hi           ; encoding: [0x7f,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbe,0x0a,0x7e]
 
 0x7c,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, null                ; encoding: [0x7c,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, null              ; encoding: [0x7c,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, null                ; encoding: [0x7c,0xbe,0x0a,0x7e]
 
 0xc1,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, -1                  ; encoding: [0xc1,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, -1                ; encoding: [0xc1,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, -1                  ; encoding: [0xc1,0xbe,0x0a,0x7e]
 
 0xf0,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, 0.5               ; encoding: [0xf0,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbe,0x0a,0x7e]
 
 0xfd,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, src_scc           ; encoding: [0xfd,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbe,0x0a,0x7e]
 
 0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-# GFX11: v_fract_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e32 v127.l, 0xfe0b          ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+0x81,0xbf,0x0a,0x7e
+# GFX11-REAL16: v_fract_f16_e32 v5.l, v1.h              ; encoding: [0x81,0xbf,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xbf,0x0a,0x7e]
+
+0xff,0xbf,0x0a,0x7e
+# GFX11-REAL16: v_fract_f16_e32 v5.l, v127.h            ; encoding: [0xff,0xbf,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xbf,0x0a,0x7e]
+
+0xf0,0xbe,0xfe,0x7e
+# GFX11-REAL16: v_fract_f16_e32 v127.l, 0.5             ; encoding: [0xf0,0xbe,0xfe,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v127, 0.5               ; encoding: [0xf0,0xbe,0xfe,0x7e]
+
+0xfd,0xbe,0x0a,0x7f
+# GFX11-REAL16: v_fract_f16_e32 v5.h, src_scc           ; encoding: [0xfd,0xbe,0x0a,0x7f]
+
+0xff,0xbe,0xfe,0x7f,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_fract_f16_e32 v127.h, 0xfe0b          ; encoding: [0xff,0xbe,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 0x01,0x41,0x0a,0x7e
 # GFX11: v_fract_f32_e32 v5, v1                  ; encoding: [0x01,0x41,0x0a,0x7e]
@@ -2290,49 +2378,82 @@
 # GFX11: v_frexp_exp_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x78,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 0x01,0xb3,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, v1             ; encoding: [0x01,0xb3,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, v1.l         ; encoding: [0x01,0xb3,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, v1             ; encoding: [0x01,0xb3,0x0a,0x7e]
 
 0x7f,0xb3,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, v127           ; encoding: [0x7f,0xb3,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, v127.l       ; encoding: [0x7f,0xb3,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, v127           ; encoding: [0x7f,0xb3,0x0a,0x7e]
 
 0x01,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, s1             ; encoding: [0x01,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, s1           ; encoding: [0x01,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, s1             ; encoding: [0x01,0xb2,0x0a,0x7e]
 
 0x69,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, s105           ; encoding: [0x69,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, s105         ; encoding: [0x69,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, s105           ; encoding: [0x69,0xb2,0x0a,0x7e]
 
 0x6a,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, vcc_lo         ; encoding: [0x6a,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, vcc_lo       ; encoding: [0x6a,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, vcc_lo         ; encoding: [0x6a,0xb2,0x0a,0x7e]
 
 0x6b,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, vcc_hi         ; encoding: [0x6b,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, vcc_hi       ; encoding: [0x6b,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, vcc_hi         ; encoding: [0x6b,0xb2,0x0a,0x7e]
 
 0x7b,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, ttmp15         ; encoding: [0x7b,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, ttmp15       ; encoding: [0x7b,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, ttmp15         ; encoding: [0x7b,0xb2,0x0a,0x7e]
 
 0x7d,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, m0             ; encoding: [0x7d,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, m0           ; encoding: [0x7d,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, m0             ; encoding: [0x7d,0xb2,0x0a,0x7e]
 
 0x7e,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, exec_lo        ; encoding: [0x7e,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, exec_lo      ; encoding: [0x7e,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, exec_lo        ; encoding: [0x7e,0xb2,0x0a,0x7e]
 
 0x7f,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, exec_hi        ; encoding: [0x7f,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, exec_hi      ; encoding: [0x7f,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, exec_hi        ; encoding: [0x7f,0xb2,0x0a,0x7e]
 
 0x7c,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, null           ; encoding: [0x7c,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, null         ; encoding: [0x7c,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, null           ; encoding: [0x7c,0xb2,0x0a,0x7e]
 
 0xc1,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, -1             ; encoding: [0xc1,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, -1           ; encoding: [0xc1,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, -1             ; encoding: [0xc1,0xb2,0x0a,0x7e]
 
 0xf0,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, 0.5            ; encoding: [0xf0,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, 0.5          ; encoding: [0xf0,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, 0.5            ; encoding: [0xf0,0xb2,0x0a,0x7e]
 
 0xfd,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, src_scc        ; encoding: [0xfd,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, src_scc      ; encoding: [0xfd,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, src_scc        ; encoding: [0xfd,0xb2,0x0a,0x7e]
 
 0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-# GFX11: v_frexp_mant_f16_e32 v127, 0xfe0b       ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v127.l, 0xfe0b     ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v127, 0xfe0b       ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+0x81,0xb3,0x0a,0x7e
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, v1.h         ; encoding: [0x81,0xb3,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xb3,0x0a,0x7e]
+
+0xff,0xb3,0x0a,0x7e
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, v127.h       ; encoding: [0xff,0xb3,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xb3,0x0a,0x7e]
+
+0xf0,0xb2,0xfe,0x7e
+# GFX11-REAL16: v_frexp_mant_f16_e32 v127.l, 0.5        ; encoding: [0xf0,0xb2,0xfe,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v127, 0.5          ; encoding: [0xf0,0xb2,0xfe,0x7e]
+
+0xfd,0xb2,0x0a,0x7f
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.h, src_scc      ; encoding: [0xfd,0xb2,0x0a,0x7f]
+
+0xff,0xb2,0xfe,0x7f,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_frexp_mant_f16_e32 v127.h, 0xfe0b     ; encoding: [0xff,0xb2,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 0x01,0x81,0x0a,0x7e
 # GFX11: v_frexp_mant_f32_e32 v5, v1             ; encoding: [0x01,0x81,0x0a,0x7e]
@@ -2638,49 +2759,82 @@
 # GFX11: v_nop                                   ; encoding: [0x00,0x00,0x00,0x7e]
 
 0x01,0xd3,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, v1                    ; encoding: [0x01,0xd3,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, v1.l                ; encoding: [0x01,0xd3,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, v1                    ; encoding: [0x01,0xd3,0x0a,0x7e]
 
 0x7f,0xd3,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, v127                  ; encoding: [0x7f,0xd3,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, v127.l              ; encoding: [0x7f,0xd3,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, v127                  ; encoding: [0x7f,0xd3,0x0a,0x7e]
 
 0x01,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, s1                    ; encoding: [0x01,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, s1                  ; encoding: [0x01,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, s1                    ; encoding: [0x01,0xd2,0x0a,0x7e]
 
 0x69,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, s105                  ; encoding: [0x69,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, s105                ; encoding: [0x69,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, s105                  ; encoding: [0x69,0xd2,0x0a,0x7e]
 
 0x6a,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, vcc_lo                ; encoding: [0x6a,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, vcc_lo              ; encoding: [0x6a,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, vcc_lo                ; encoding: [0x6a,0xd2,0x0a,0x7e]
 
 0x6b,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, vcc_hi                ; encoding: [0x6b,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, vcc_hi              ; encoding: [0x6b,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, vcc_hi                ; encoding: [0x6b,0xd2,0x0a,0x7e]
 
 0x7b,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, ttmp15                ; encoding: [0x7b,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, ttmp15              ; encoding: [0x7b,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, ttmp15                ; encoding: [0x7b,0xd2,0x0a,0x7e]
 
 0x7d,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, m0                    ; encoding: [0x7d,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, m0                  ; encoding: [0x7d,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, m0                    ; encoding: [0x7d,0xd2,0x0a,0x7e]
 
 0x7e,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, exec_lo               ; encoding: [0x7e,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, exec_lo             ; encoding: [0x7e,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, exec_lo               ; encoding: [0x7e,0xd2,0x0a,0x7e]
 
 0x7f,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, exec_hi               ; encoding: [0x7f,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, exec_hi             ; encoding: [0x7f,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, exec_hi               ; encoding: [0x7f,0xd2,0x0a,0x7e]
 
 0x7c,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, null                  ; encoding: [0x7c,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, null                ; encoding: [0x7c,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, null                  ; encoding: [0x7c,0xd2,0x0a,0x7e]
 
 0xc1,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, -1                    ; encoding: [0xc1,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, -1                  ; encoding: [0xc1,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, -1                    ; encoding: [0xc1,0xd2,0x0a,0x7e]
 
 0xf0,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, 0x3800                ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e32 v5.l, 0x3800              ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e32 v5, 0x3800                ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00]
 
 0xfd,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, src_scc               ; encoding: [0xfd,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, src_scc             ; encoding: [0xfd,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, src_scc               ; encoding: [0xfd,0xd2,0x0a,0x7e]
 
 0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-# GFX11: v_not_b16_e32 v127, 0xfe0b              ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e32 v127.l, 0xfe0b            ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e32 v127, 0xfe0b              ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+0x81,0xd3,0x0a,0x7e
+# GFX11-REAL16: v_not_b16_e32 v5.l, v1.h                ; encoding: [0x81,0xd3,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xd3,0x0a,0x7e]
+
+0xff,0xd3,0x0a,0x7e
+# GFX11-REAL16: v_not_b16_e32 v5.l, v127.h              ; encoding: [0xff,0xd3,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xd3,0x0a,0x7e]
+
+0xf0,0xd2,0xfe,0x7e
+# GFX11-REAL16: v_not_b16_e32 v127.l, 0x3800            ; encoding: [0xff,0xd2,0xfe,0x7e,0x00,0x38,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e32 v127, 0x3800              ; encoding: [0xff,0xd2,0xfe,0x7e,0x00,0x38,0x00,0x00]
+
+0xfd,0xd2,0x0a,0x7f
+# GFX11-REAL16: v_not_b16_e32 v5.h, src_scc             ; encoding: [0xfd,0xd2,0x0a,0x7f]
+
+0xff,0xd2,0xfe,0x7f,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_not_b16_e32 v127.h, 0xfe0b            ; encoding: [0xff,0xd2,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 0x01,0x6f,0x0a,0x7e
 # GFX11: v_not_b32_e32 v5, v1                    ; encoding: [0x01,0x6f,0x0a,0x7e]
@@ -2947,49 +3101,82 @@
 # GFX11: v_readfirstlane_b32 null, v255          ; encoding: [0xff,0x05,0xf8,0x7e]
 
 0x01,0xbd,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, v1                  ; encoding: [0x01,0xbd,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, v1.l              ; encoding: [0x01,0xbd,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, v1                  ; encoding: [0x01,0xbd,0x0a,0x7e]
 
 0x7f,0xbd,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, v127                ; encoding: [0x7f,0xbd,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, v127.l            ; encoding: [0x7f,0xbd,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, v127                ; encoding: [0x7f,0xbd,0x0a,0x7e]
 
 0x01,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, s1                  ; encoding: [0x01,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, s1                ; encoding: [0x01,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, s1                  ; encoding: [0x01,0xbc,0x0a,0x7e]
 
 0x69,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, s105                ; encoding: [0x69,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, s105              ; encoding: [0x69,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, s105                ; encoding: [0x69,0xbc,0x0a,0x7e]
 
 0x6a,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, vcc_lo            ; encoding: [0x6a,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbc,0x0a,0x7e]
 
 0x6b,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, vcc_hi            ; encoding: [0x6b,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbc,0x0a,0x7e]
 
 0x7b,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, ttmp15            ; encoding: [0x7b,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbc,0x0a,0x7e]
 
 0x7d,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, m0                  ; encoding: [0x7d,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, m0                ; encoding: [0x7d,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, m0                  ; encoding: [0x7d,0xbc,0x0a,0x7e]
 
 0x7e,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, exec_lo           ; encoding: [0x7e,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbc,0x0a,0x7e]
 
 0x7f,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, exec_hi           ; encoding: [0x7f,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbc,0x0a,0x7e]
 
 0x7c,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, null                ; encoding: [0x7c,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, null              ; encoding: [0x7c,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, null                ; encoding: [0x7c,0xbc,0x0a,0x7e]
 
 0xc1,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, -1                  ; encoding: [0xc1,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, -1                ; encoding: [0xc1,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, -1                  ; encoding: [0xc1,0xbc,0x0a,0x7e]
 
 0xf0,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, 0.5               ; encoding: [0xf0,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbc,0x0a,0x7e]
 
 0xfd,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, src_scc           ; encoding: [0xfd,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbc,0x0a,0x7e]
 
 0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-# GFX11: v_rndne_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e32 v127.l, 0xfe0b          ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+0x81,0xbd,0x0a,0x7e
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, v1.h              ; encoding: [0x81,0xbd,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xbd,0x0a,0x7e]
+
+0xff,0xbd,0x0a,0x7e
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, v127.h            ; encoding: [0xff,0xbd,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xbd,0x0a,0x7e]
+
+0xf0,0xbc,0xfe,0x7e
+# GFX11-REAL16: v_rndne_f16_e32 v127.l, 0.5             ; encoding: [0xf0,0xbc,0xfe,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v127, 0.5               ; encoding: [0xf0,0xbc,0xfe,0x7e]
+
+0xfd,0xbc,0x0a,0x7f
+# GFX11-REAL16: v_rndne_f16_e32 v5.h, src_scc           ; encoding: [0xfd,0xbc,0x0a,0x7f]
+
+0xff,0xbc,0xfe,0x7f,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_e32 v127.h, 0xfe0b          ; encoding: [0xff,0xbc,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 0x01,0x47,0x0a,0x7e
 # GFX11: v_rndne_f32_e32 v5, v1                  ; encoding: [0x01,0x47,0x0a,0x7e]
@@ -3220,94 +3407,152 @@
 # GFX11: v_rsq_f64_e32 v[254:255], 0xaf123456    ; encoding: [0xff,0x62,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
 0x01,0xc5,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, v1              ; encoding: [0x01,0xc5,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, v1            ; encoding: [0x01,0xc5,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, v1              ; encoding: [0x01,0xc5,0x0a,0x7e]
 
 0xff,0xc5,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, v255            ; encoding: [0xff,0xc5,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, v255          ; encoding: [0xff,0xc5,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, v255            ; encoding: [0xff,0xc5,0x0a,0x7e]
 
 0x01,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, s1              ; encoding: [0x01,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, s1            ; encoding: [0x01,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, s1              ; encoding: [0x01,0xc4,0x0a,0x7e]
 
 0x69,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, s105            ; encoding: [0x69,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, s105          ; encoding: [0x69,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, s105            ; encoding: [0x69,0xc4,0x0a,0x7e]
 
 0x6a,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, vcc_lo          ; encoding: [0x6a,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, vcc_lo        ; encoding: [0x6a,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, vcc_lo          ; encoding: [0x6a,0xc4,0x0a,0x7e]
 
 0x6b,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, vcc_hi          ; encoding: [0x6b,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, vcc_hi        ; encoding: [0x6b,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, vcc_hi          ; encoding: [0x6b,0xc4,0x0a,0x7e]
 
 0x7b,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, ttmp15          ; encoding: [0x7b,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, ttmp15        ; encoding: [0x7b,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, ttmp15          ; encoding: [0x7b,0xc4,0x0a,0x7e]
 
 0x7d,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, m0              ; encoding: [0x7d,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, m0            ; encoding: [0x7d,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, m0              ; encoding: [0x7d,0xc4,0x0a,0x7e]
 
 0x7e,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, exec_lo         ; encoding: [0x7e,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, exec_lo       ; encoding: [0x7e,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, exec_lo         ; encoding: [0x7e,0xc4,0x0a,0x7e]
 
 0x7f,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, exec_hi         ; encoding: [0x7f,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, exec_hi       ; encoding: [0x7f,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, exec_hi         ; encoding: [0x7f,0xc4,0x0a,0x7e]
 
 0x7c,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, null            ; encoding: [0x7c,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, null          ; encoding: [0x7c,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, null            ; encoding: [0x7c,0xc4,0x0a,0x7e]
 
 0xc1,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, -1              ; encoding: [0xc1,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, -1            ; encoding: [0xc1,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, -1              ; encoding: [0xc1,0xc4,0x0a,0x7e]
 
 0xf0,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, 0.5             ; encoding: [0xf0,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, 0.5           ; encoding: [0xf0,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, 0.5             ; encoding: [0xf0,0xc4,0x0a,0x7e]
 
 0xfd,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, src_scc         ; encoding: [0xfd,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, src_scc       ; encoding: [0xfd,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, src_scc         ; encoding: [0xfd,0xc4,0x0a,0x7e]
 
 0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e32 v127, 0xfe0b        ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v127.l, 0xfe0b      ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v127, 0xfe0b        ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+0xf0,0xc4,0xfe,0x7e
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v127.l, 0.5         ; encoding: [0xf0,0xc4,0xfe,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v127, 0.5           ; encoding: [0xf0,0xc4,0xfe,0x7e]
+
+0xfd,0xc4,0x0a,0x7f
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.h, src_scc       ; encoding: [0xfd,0xc4,0x0a,0x7f]
+
+0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v127.h, 0xfe0b      ; encoding: [0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 0x01,0xc1,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, v1                    ; encoding: [0x01,0xc1,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, v1.l                ; encoding: [0x01,0xc1,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, v1                    ; encoding: [0x01,0xc1,0x0a,0x7e]
 
 0x7f,0xc1,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, v127                  ; encoding: [0x7f,0xc1,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, v127.l              ; encoding: [0x7f,0xc1,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, v127                  ; encoding: [0x7f,0xc1,0x0a,0x7e]
 
 0x01,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, s1                    ; encoding: [0x01,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, s1                  ; encoding: [0x01,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, s1                    ; encoding: [0x01,0xc0,0x0a,0x7e]
 
 0x69,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, s105                  ; encoding: [0x69,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, s105                ; encoding: [0x69,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, s105                  ; encoding: [0x69,0xc0,0x0a,0x7e]
 
 0x6a,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, vcc_lo              ; encoding: [0x6a,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xc0,0x0a,0x7e]
 
 0x6b,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, vcc_hi              ; encoding: [0x6b,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xc0,0x0a,0x7e]
 
 0x7b,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, ttmp15              ; encoding: [0x7b,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xc0,0x0a,0x7e]
 
 0x7d,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, m0                    ; encoding: [0x7d,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, m0                  ; encoding: [0x7d,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, m0                    ; encoding: [0x7d,0xc0,0x0a,0x7e]
 
 0x7e,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, exec_lo             ; encoding: [0x7e,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xc0,0x0a,0x7e]
 
 0x7f,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, exec_hi             ; encoding: [0x7f,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xc0,0x0a,0x7e]
 
 0x7c,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, null                  ; encoding: [0x7c,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, null                ; encoding: [0x7c,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, null                  ; encoding: [0x7c,0xc0,0x0a,0x7e]
 
 0xc1,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, -1                    ; encoding: [0xc1,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, -1                  ; encoding: [0xc1,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, -1                    ; encoding: [0xc1,0xc0,0x0a,0x7e]
 
 0xf0,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, 0.5                 ; encoding: [0xf0,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xc0,0x0a,0x7e]
 
 0xfd,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, src_scc               ; encoding: [0xfd,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, src_scc             ; encoding: [0xfd,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, src_scc               ; encoding: [0xfd,0xc0,0x0a,0x7e]
 
 0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-# GFX11: v_sin_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e32 v127.l, 0xfe0b            ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+0x81,0xc1,0x0a,0x7e
+# GFX11-REAL16: v_sin_f16_e32 v5.l, v1.h                ; encoding: [0x81,0xc1,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xc1,0x0a,0x7e]
+
+0xff,0xc1,0x0a,0x7e
+# GFX11-REAL16: v_sin_f16_e32 v5.l, v127.h              ; encoding: [0xff,0xc1,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xc1,0x0a,0x7e]
+
+0xf0,0xc0,0xfe,0x7e
+# GFX11-REAL16: v_sin_f16_e32 v127.l, 0.5               ; encoding: [0xf0,0xc0,0xfe,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v127, 0.5                 ; encoding: [0xf0,0xc0,0xfe,0x7e]
+
+0xfd,0xc0,0x0a,0x7f
+# GFX11-REAL16: v_sin_f16_e32 v5.h, src_scc             ; encoding: [0xfd,0xc0,0x0a,0x7f]
+
+0xff,0xc0,0xfe,0x7f,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_sin_f16_e32 v127.h, 0xfe0b            ; encoding: [0xff,0xc0,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 0x01,0x6b,0x0a,0x7e
 # GFX11: v_sin_f32_e32 v5, v1                    ; encoding: [0x01,0x6b,0x0a,0x7e]
@@ -3520,49 +3765,82 @@
 # GFX11: v_swaprel_b32 v255, v255                ; encoding: [0xff,0xd1,0xfe,0x7f]
 
 0x01,0xbb,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, v1                  ; encoding: [0x01,0xbb,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, v1.l              ; encoding: [0x01,0xbb,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, v1                  ; encoding: [0x01,0xbb,0x0a,0x7e]
 
 0x7f,0xbb,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, v127                ; encoding: [0x7f,0xbb,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, v127.l            ; encoding: [0x7f,0xbb,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, v127                ; encoding: [0x7f,0xbb,0x0a,0x7e]
 
 0x01,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, s1                  ; encoding: [0x01,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, s1                ; encoding: [0x01,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, s1                  ; encoding: [0x01,0xba,0x0a,0x7e]
 
 0x69,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, s105                ; encoding: [0x69,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, s105              ; encoding: [0x69,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, s105                ; encoding: [0x69,0xba,0x0a,0x7e]
 
 0x6a,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, vcc_lo            ; encoding: [0x6a,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xba,0x0a,0x7e]
 
 0x6b,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, vcc_hi            ; encoding: [0x6b,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xba,0x0a,0x7e]
 
 0x7b,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, ttmp15            ; encoding: [0x7b,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xba,0x0a,0x7e]
 
 0x7d,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, m0                  ; encoding: [0x7d,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, m0                ; encoding: [0x7d,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, m0                  ; encoding: [0x7d,0xba,0x0a,0x7e]
 
 0x7e,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, exec_lo           ; encoding: [0x7e,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xba,0x0a,0x7e]
 
 0x7f,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, exec_hi           ; encoding: [0x7f,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xba,0x0a,0x7e]
 
 0x7c,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, null                ; encoding: [0x7c,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, null              ; encoding: [0x7c,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, null                ; encoding: [0x7c,0xba,0x0a,0x7e]
 
 0xc1,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, -1                  ; encoding: [0xc1,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, -1                ; encoding: [0xc1,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, -1                  ; encoding: [0xc1,0xba,0x0a,0x7e]
 
 0xf0,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, 0.5               ; encoding: [0xf0,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xba,0x0a,0x7e]
 
 0xfd,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, src_scc             ; encoding: [0xfd,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, src_scc           ; encoding: [0xfd,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, src_scc             ; encoding: [0xfd,0xba,0x0a,0x7e]
 
 0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-# GFX11: v_trunc_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e32 v127.l, 0xfe0b          ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+0x81,0xbb,0x0a,0x7e
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, v1.h              ; encoding: [0x81,0xbb,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xbb,0x0a,0x7e]
+
+0xff,0xbb,0x0a,0x7e
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, v127.h            ; encoding: [0xff,0xbb,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xbb,0x0a,0x7e]
+
+0xf0,0xba,0xfe,0x7e
+# GFX11-REAL16: v_trunc_f16_e32 v127.l, 0.5             ; encoding: [0xf0,0xba,0xfe,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v127, 0.5               ; encoding: [0xf0,0xba,0xfe,0x7e]
+
+0xfd,0xba,0x0a,0x7f
+# GFX11-REAL16: v_trunc_f16_e32 v5.h, src_scc           ; encoding: [0xfd,0xba,0x0a,0x7f]
+
+0xff,0xba,0xfe,0x7f,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_trunc_f16_e32 v127.h, 0xfe0b          ; encoding: [0xff,0xba,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 0x01,0x43,0x0a,0x7e
 # GFX11: v_trunc_f32_e32 v5, v1                  ; encoding: [0x01,0x43,0x0a,0x7e]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt
index 1075a3e..cabae81 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt
@@ -229,46 +229,72 @@
 # GFX11: v_clz_i32_u32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x72,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX11: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-REAL16: v_cos_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xc2,0xfe,0x7e,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_cos_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cos_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_cos_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX11-REAL16: v_cos_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX11: v_cos_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -971,46 +997,72 @@
 # GFX11: v_cvt_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x10,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30
-# GFX11: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+
+0xfa,0xd4,0x0a,0x7e,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x7f,0x5f,0x01,0x01]
+
+0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13]
+
+0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cvt_i32_i16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
 
 0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX11: v_cvt_nearest_i32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1307,46 +1359,72 @@
 # GFX11: v_cvt_u32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x0e,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x0d,0x30
-# GFX11: v_cvt_u32_u16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+
+0xfa,0xd6,0x0a,0x7e,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x7f,0x5f,0x01,0x01]
+
+0xfa,0xd6,0x0a,0x7e,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x81,0x60,0x01,0x13]
+
+0xfa,0xd6,0xfe,0x7f,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cvt_u32_u16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
 
 0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX11-REAL16: v_exp_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1545,46 +1623,72 @@
 # GFX11: v_floor_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x48,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX11: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-REAL16: v_fract_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xbe,0xfe,0x7e,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_fract_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_fract_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_fract_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX11-REAL16: v_fract_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX11: v_fract_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1727,46 +1831,72 @@
 # GFX11: v_frexp_exp_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x7e,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX11: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xb2,0xfe,0x7e,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_frexp_mant_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX11-REAL16: v_frexp_mant_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX11: v_frexp_mant_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2119,46 +2249,72 @@
 # GFX11: v_movrelsd_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x88,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30
-# GFX11: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_not_b16_dpp v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30]
+
+0xfa,0xd2,0xfe,0x7e,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_not_b16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_not_b16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_not_b16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_not_b16_dpp v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_lshlrev_b32_e32 v6, v255, v183        ; encoding: [0xff,0x6f,0x0d,0x30]
 
 0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX11: v_not_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2343,46 +2499,72 @@
 # GFX11: v_rcp_iflag_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x56,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX11: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-REAL16: v_rndne_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xbc,0xfe,0x7e,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_rndne_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_rndne_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_rndne_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX11-REAL16: v_rndne_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX11: v_rndne_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2525,88 +2707,140 @@
 # GFX11: v_rsq_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x5c,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30
-# GFX11: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30]
+
+0xfa,0xc4,0xfe,0x7e,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v127, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0x01,0x5f,0x01,0x01]
+
+0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_mul_i32_i24_e32 v128, s1, v176        ; encoding: [0x01,0x60,0x01,0x13]
+
+0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_lshlrev_b32_e32 v6, v255, v183        ; encoding: [0xff,0x6f,0x0d,0x30]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX11: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-REAL16: v_sin_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xc0,0xfe,0x7e,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_sin_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_sin_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_sin_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX11-REAL16: v_sin_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX11: v_sin_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2749,46 +2983,72 @@
 # GFX11: v_sqrt_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x66,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX11: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-REAL16: v_trunc_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xba,0xfe,0x7e,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_trunc_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_trunc_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+0xfa,0xba,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_trunc_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX11-REAL16: v_trunc_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX11: v_trunc_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt
index 051dd34..fc7cbba 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt
@@ -43,10 +43,23 @@
 # GFX11: v_clz_i32_u32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x72,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX11: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xc2,0xfe,0x7e,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cos_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cos_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+0xe9,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cos_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187      ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cos_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX11: v_cos_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -185,10 +198,24 @@
 # GFX11: v_cvt_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x10,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00
-# GFX11: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+
+0xe9,0xd4,0x0a,0x7e,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x7f,0x77,0x39,0x05]
+
+0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05]
+
+0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cvt_i32_i16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX11: v_cvt_nearest_i32_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -254,10 +281,24 @@
 # GFX11: v_cvt_u32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x0e,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_cvt_u32_u16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00
-# GFX11: v_cvt_u32_u16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+
+0xe9,0xd6,0x0a,0x7e,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x7f,0x77,0x39,0x05]
+
+0xe9,0xd6,0x0a,0x7e,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x81,0x77,0x39,0x05]
+
+0xea,0xd6,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cvt_u32_u16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX11-REAL16: v_exp_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -302,10 +343,23 @@
 # GFX11: v_floor_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x48,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX11: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xbe,0xfe,0x7e,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_fract_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_fract_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+0xe9,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_fract_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187      ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_fract_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX11: v_fract_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -335,14 +389,33 @@
 # GFX11: v_frexp_exp_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x7e,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
 
+# GFX11-FAKE16-REAL16: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX11: v_frexp_mant_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
+# GFX11-FAKE16-REAL16: v_frexp_mant_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-REAL16-REAL16: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+# GFX11-REAL16-REAL16: v_frexp_mant_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+0xe9,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187      ; encoding: [0x81,0x77,0x39,0x05]
+
+# GFX11-REAL16-REAL16: v_frexp_mant_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00]
+0xea,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_frexp_mant_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
 0xea,0x80,0xfe,0x7f,0xff,0x00,0x00,0x00
 # GFX11: v_frexp_mant_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x80,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
@@ -398,10 +471,23 @@
 # GFX11: v_movrelsd_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x88,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX11: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xd2,0xfe,0x7e,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_not_b16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_not_b16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+0xe9,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_not_b16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187      ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_not_b16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX11: v_not_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -437,10 +523,23 @@
 # GFX11: v_rcp_iflag_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x56,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX11: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xbc,0xfe,0x7e,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_rndne_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+0xe9,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_rndne_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187      ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX11: v_rndne_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -470,16 +569,42 @@
 # GFX11: v_rsq_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x5c,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.l, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00]
+
+0xe9,0xc4,0xfe,0x7e,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v127, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0x01,0x77,0x39,0x05]
+
+0xe9,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v1, v187        ; encoding: [0x01,0x77,0x39,0x05]
+
+0xea,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX11: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xc0,0xfe,0x7e,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_sin_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_sin_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+0xe9,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_sin_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187      ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_sin_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX11: v_sin_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -509,10 +634,23 @@
 # GFX11: v_sqrt_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x66,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX11: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xba,0xfe,0x7e,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_trunc_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_trunc_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+0xe9,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_trunc_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187      ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_trunc_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX11: v_trunc_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
index 4990c62..adcca58 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
@@ -2402,53 +2402,125 @@
 # GFX11: v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x09,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00
-# GFX11: v_fma_f16 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+# W32-REAL16: v_fma_f16 v5.l, v1.l, v2.l, s3          ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+# W32-FAKE16: v_fma_f16 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+# W64-REAL16: v_fma_f16 v5.l, v1.l, v2.l, s3          ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+# W64-FAKE16: v_fma_f16 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
 
 0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01
-# GFX11: v_fma_f16 v5, v255, s2, s105            ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W32-REAL16: v_fma_f16 v5.l, v255.l, s2, s105        ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_fma_f16 v5, v255, s2, s105            ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_fma_f16 v5.l, v255.l, s2, s105        ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_fma_f16 v5, v255, s2, s105            ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
 
 0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01
-# GFX11: v_fma_f16 v5, s1, v255, exec_hi         ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W32-REAL16: v_fma_f16 v5.l, s1, v255.l, exec_hi     ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_fma_f16 v5, s1, v255, exec_hi         ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_fma_f16 v5.l, s1, v255.l, exec_hi     ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_fma_f16 v5, s1, v255, exec_hi         ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
 
 0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01
-# GFX11: v_fma_f16 v5, s105, s105, exec_lo       ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-REAL16: v_fma_f16 v5.l, s105, s105, exec_lo     ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-FAKE16: v_fma_f16 v5, s105, s105, exec_lo       ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-REAL16: v_fma_f16 v5.l, s105, s105, exec_lo     ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-FAKE16: v_fma_f16 v5, s105, s105, exec_lo       ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
 
 0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04
-# GFX11: v_fma_f16 v5, vcc_lo, ttmp15, v3        ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-REAL16: v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l    ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-FAKE16: v_fma_f16 v5, vcc_lo, ttmp15, v3        ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-REAL16: v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l    ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-FAKE16: v_fma_f16 v5, vcc_lo, ttmp15, v3        ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
 
 0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
-# GFX11: v_fma_f16 v5, vcc_hi, 0xfe0b, v255      ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l  ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v5, vcc_hi, 0xfe0b, v255      ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l  ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v5, vcc_hi, 0xfe0b, v255      ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1
-# GFX11: v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-REAL16: v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-FAKE16: v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-REAL16: v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-FAKE16: v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
 
 0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01
-# GFX11: v_fma_f16 v5, m0, 0.5, m0               ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-REAL16: v_fma_f16 v5.l, m0, 0.5, m0             ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-FAKE16: v_fma_f16 v5, m0, 0.5, m0               ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-REAL16: v_fma_f16 v5.l, m0, 0.5, m0             ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-FAKE16: v_fma_f16 v5, m0, 0.5, m0               ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
 
 0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01
-# GFX11: v_fma_f16 v5, |exec_lo|, -1, vcc_hi     ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+# W32-REAL16: v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi   ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+# W32-FAKE16: v_fma_f16 v5, |exec_lo|, -1, vcc_hi     ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+# W64-REAL16: v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi   ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+# W64-FAKE16: v_fma_f16 v5, |exec_lo|, -1, vcc_hi     ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
 
 0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1
-# GFX11: v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-REAL16: v_fma_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-FAKE16: v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-REAL16: v_fma_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-FAKE16: v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
 
 0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00
-# GFX11: v_fma_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_fma_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
 0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3
-# GFX11: v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-REAL16: v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-FAKE16: v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-REAL16: v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-FAKE16: v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
 
 0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43
-# GFX11: v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+# W32-REAL16: v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+# W32-FAKE16: v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+# W64-REAL16: v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+# W64-FAKE16: v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
 
 0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23
-# GFX11: v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+# W32-REAL16: v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+# W32-FAKE16: v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+# W64-REAL16: v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+# W64-FAKE16: v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
 
 0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00
-# GFX11: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
 # CHECK: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2    ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
-# GFX11: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01
+# W32-REAL16: v_fma_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_fma_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_fma_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_fma_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+
+0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01
+# W32-REAL16: v_fma_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_fma_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_fma_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_fma_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+
+0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x13,0xd6,0x01,0x05,0x0e,0x00
 # GFX11: v_fma_f32 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x13,0xd6,0x01,0x05,0x0e,0x00]
@@ -4289,49 +4361,118 @@
 # W64-FAKE16: v_max_u16 v255, 0xfe0b, vcc_hi          ; encoding: [0xff,0x00,0x09,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00
-# GFX11: v_maxmin_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
+# W32-REAL16: v_maxmin_f16 v5.l, v1.l, v2.l, s3       ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
+# W32-FAKE16: v_maxmin_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
+# W64-REAL16: v_maxmin_f16 v5.l, v1.l, v2.l, s3       ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
+# W64-FAKE16: v_maxmin_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
 
 0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01
-# GFX11: v_maxmin_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W32-REAL16: v_maxmin_f16 v5.l, v255.l, s2, s105     ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, v255.l, s2, s105     ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
 
 0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01
-# GFX11: v_maxmin_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W32-REAL16: v_maxmin_f16 v5.l, s1, v255.l, exec_hi  ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, s1, v255.l, exec_hi  ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
 
 0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01
-# GFX11: v_maxmin_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-REAL16: v_maxmin_f16 v5.l, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
 
 0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04
-# GFX11: v_maxmin_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-REAL16: v_maxmin_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-FAKE16: v_maxmin_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-REAL16: v_maxmin_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-FAKE16: v_maxmin_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
 
 0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
-# GFX11: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1
-# GFX11: v_maxmin_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-REAL16: v_maxmin_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-FAKE16: v_maxmin_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-REAL16: v_maxmin_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-FAKE16: v_maxmin_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
 
 0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01
-# GFX11: v_maxmin_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-REAL16: v_maxmin_f16 v5.l, m0, 0.5, m0          ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, m0, 0.5, m0          ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
 
 0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01
-# GFX11: v_maxmin_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
+# W32-REAL16: v_maxmin_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
 
 0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1
-# GFX11: v_maxmin_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-REAL16: v_maxmin_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-FAKE16: v_maxmin_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-REAL16: v_maxmin_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-FAKE16: v_maxmin_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
 
 0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00
-# GFX11: v_maxmin_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_maxmin_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
 0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3
-# GFX11: v_maxmin_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-REAL16: v_maxmin_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-FAKE16: v_maxmin_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-REAL16: v_maxmin_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-FAKE16: v_maxmin_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
 
 0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b
-# GFX11: v_maxmin_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-REAL16: v_maxmin_f16 v5.l, 0.5, -m0, 0.5 mul:2  ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-FAKE16: v_maxmin_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-REAL16: v_maxmin_f16 v5.l, 0.5, -m0, 0.5 mul:2  ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-FAKE16: v_maxmin_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
 
 0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33
-# GFX11: v_maxmin_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-REAL16: v_maxmin_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-FAKE16: v_maxmin_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-REAL16: v_maxmin_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-FAKE16: v_maxmin_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
 
 0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
-# GFX11: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_maxmin_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01
+# W32-REAL16: v_maxmin_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01]
+
+0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01
+# W32-REAL16: v_maxmin_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01]
+
+0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_maxmin_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x5e,0xd6,0x01,0x05,0x0e,0x00
 # GFX11: v_maxmin_f32 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x5e,0xd6,0x01,0x05,0x0e,0x00]
@@ -5779,49 +5920,118 @@
 # W64-FAKE16: v_min_u16 v255, 0xfe0b, vcc_hi          ; encoding: [0xff,0x00,0x0b,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00
-# GFX11: v_minmax_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
+# W32-REAL16: v_minmax_f16 v5.l, v1.l, v2.l, s3       ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
+# W32-FAKE16: v_minmax_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
+# W64-REAL16: v_minmax_f16 v5.l, v1.l, v2.l, s3       ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
+# W64-FAKE16: v_minmax_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
 
 0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01
-# GFX11: v_minmax_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W32-REAL16: v_minmax_f16 v5.l, v255.l, s2, s105     ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_minmax_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, v255.l, s2, s105     ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_minmax_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
 
 0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01
-# GFX11: v_minmax_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W32-REAL16: v_minmax_f16 v5.l, s1, v255.l, exec_hi  ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_minmax_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, s1, v255.l, exec_hi  ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_minmax_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
 
 0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01
-# GFX11: v_minmax_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-REAL16: v_minmax_f16 v5.l, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-FAKE16: v_minmax_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-FAKE16: v_minmax_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
 
 0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04
-# GFX11: v_minmax_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-REAL16: v_minmax_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-FAKE16: v_minmax_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-REAL16: v_minmax_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-FAKE16: v_minmax_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
 
 0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
-# GFX11: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1
-# GFX11: v_minmax_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-REAL16: v_minmax_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-FAKE16: v_minmax_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-REAL16: v_minmax_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-FAKE16: v_minmax_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
 
 0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01
-# GFX11: v_minmax_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-REAL16: v_minmax_f16 v5.l, m0, 0.5, m0          ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-FAKE16: v_minmax_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, m0, 0.5, m0          ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-FAKE16: v_minmax_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
 
 0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01
-# GFX11: v_minmax_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
+# W32-REAL16: v_minmax_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
+# W32-FAKE16: v_minmax_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
+# W64-FAKE16: v_minmax_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
 
 0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1
-# GFX11: v_minmax_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-REAL16: v_minmax_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-FAKE16: v_minmax_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-REAL16: v_minmax_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-FAKE16: v_minmax_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
 
 0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00
-# GFX11: v_minmax_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_minmax_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
 0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3
-# GFX11: v_minmax_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-REAL16: v_minmax_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-FAKE16: v_minmax_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-REAL16: v_minmax_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-FAKE16: v_minmax_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
 
 0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b
-# GFX11: v_minmax_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-REAL16: v_minmax_f16 v5.l, 0.5, -m0, 0.5 mul:2  ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-FAKE16: v_minmax_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-REAL16: v_minmax_f16 v5.l, 0.5, -m0, 0.5 mul:2  ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-FAKE16: v_minmax_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
 
 0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33
-# GFX11: v_minmax_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-REAL16: v_minmax_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-FAKE16: v_minmax_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-REAL16: v_minmax_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-FAKE16: v_minmax_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
 
 0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
-# GFX11: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_minmax_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01
+# W32-REAL16: v_minmax_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_minmax_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_minmax_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01]
+
+0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01
+# W32-REAL16: v_minmax_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_minmax_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_minmax_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01]
+
+0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_minmax_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x5f,0xd6,0x01,0x05,0x0e,0x00
 # GFX11: v_minmax_f32 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x5f,0xd6,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt
index d734cd2..2964360 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt
@@ -2113,46 +2113,118 @@
 # W64-FAKE16: v_max_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x09,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
 0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
 0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
 0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
 0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
 0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01
-# GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
 
 0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
 
 0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
-# GFX11: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+
+0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+
+0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
+# W32-REAL16: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x5e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # GFX11: v_maxmin_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -2833,46 +2905,118 @@
 # W64-FAKE16: v_min_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x0b,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
 0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
 0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
 0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
 0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
 0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01
-# GFX11: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
 
 0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13
-# GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
 
 0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
-# GFX11: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+
+0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+
+0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
+# W32-REAL16: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x5f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # GFX11: v_minmax_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -4431,46 +4575,130 @@
 # W64-FAKE16: v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x54,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
 0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff
-# GFX11: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
 
 0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff
-# GFX11: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
 0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff
-# GFX11: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
 0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff
-# GFX11: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
 0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01
-# GFX11: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
 0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13
-# GFX11: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
 
 0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30
-# GFX11: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+
+0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+
+0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+
+0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30
+# W32-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x53,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # W32-REAL16: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt
index 2666b75..282ff229 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt
@@ -239,46 +239,72 @@
 # GFX11: v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX11: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX11: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX11: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX11: v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1025,46 +1051,64 @@
 # GFX11: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
 
 0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
-# GFX11: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+
+0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX11: v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1397,46 +1441,64 @@
 # GFX11: v_cvt_u32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0x87,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
 
 0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
-# GFX11: v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+
+0xff,0x08,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v255, v255.h op_sel:[1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x08,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX11-REAL16: v_exp_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1659,46 +1721,72 @@
 # GFX11: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX11: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX11: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX11: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX11: v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1853,46 +1941,72 @@
 # GFX11: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX11: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX11: v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2257,46 +2371,72 @@
 # GFX11: v_movrelsd_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xc4,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01
-# GFX11: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13
-# GFX11: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
 
 0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
-# GFX11: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+
+0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX11: v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2493,46 +2633,72 @@
 # GFX11: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX11: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX11: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX11: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX11: v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2687,88 +2853,132 @@
 # GFX11: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xae,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
 
 0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
-# GFX11: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v255.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+
+0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX11: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX11: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX11: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX11: v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2923,46 +3133,72 @@
 # GFX11: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX11: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX11: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX11: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX11: v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
index 3b3d398..7a81ba2 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
@@ -1141,40 +1141,106 @@
 # W64-FAKE16: v_max_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x09,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
 0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
 0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
 0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
 
 0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
 
 0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
-# GFX11: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+
+0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+
+0x05,0x13,0x60,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x60,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x60,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x60,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x60,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+0xff,0xc7,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
+# W32-REAL16: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x5e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # GFX11: v_maxmin_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -1585,40 +1651,106 @@
 # W64-FAKE16: v_min_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x0b,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
 0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
 0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
 0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
 
 0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
 
 0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
-# GFX11: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+
+0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+
+0x05,0x13,0x61,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x61,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x61,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x61,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x61,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+0xff,0xc7,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
+# W32-REAL16: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x5f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # GFX11: v_minmax_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -2547,46 +2679,130 @@
 # W64-FAKE16: v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x54,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
 0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
 0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
 0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
 0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
 0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00
-# GFX11: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+
+0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+
+0x05,0x13,0x48,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x48,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x48,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x48,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x48,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+
+0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00
+# W32-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 0x05,0x78,0x53,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # W32-REAL16: v_mad_i16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x53,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt
index c19947c..5995762 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt
@@ -61,16 +61,32 @@
 # GFX11: v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX11: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX11: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX11: v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -291,10 +307,16 @@
 # GFX11: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xea,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
-# GFX11: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+0xff,0x08,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x08,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX11: v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -381,10 +403,16 @@
 # GFX11: v_cvt_u32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0x87,0xd5,0xea,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
-# GFX11: v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+0xff,0x08,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v255, v255.h op_sel:[1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x08,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX11-REAL16: v_exp_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -455,16 +483,32 @@
 # GFX11: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX11: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX11: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX11: v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -505,16 +549,32 @@
 # GFX11: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xea,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX11: v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -599,10 +659,24 @@
 # GFX11: v_movrelsd_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xc4,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
-# GFX11: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+0xff,0x40,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX11: v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -663,16 +737,32 @@
 # GFX11: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX11: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX11: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX11: v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -727,22 +817,44 @@
 # GFX11: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xae,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v255.l, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+0xff,0x40,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX11: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX11: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX11: v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -797,16 +909,32 @@
 # GFX11: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX11: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX11: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX11: v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt
index 3df206c..d7e7390 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt
@@ -281,49 +281,76 @@
 # GFX11: v_clz_i32_u32_e64 v255, 0xaf123456      ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08
-# GFX11: v_cos_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, 0.5 mul:2           ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-FAKE16: v_cos_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10
-# GFX11: v_cos_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, src_scc mul:4       ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-FAKE16: v_cos_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX11: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00
+# GFX11-REAL16: v_cos_f16_e64 v5.h, v1.h op_sel:[1,1]   ; encoding: [0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00
+# GFX11-REAL16: v_cos_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_cos_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00
 # GFX11: v_cos_f32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00]
@@ -1314,10 +1341,12 @@
 # GFX11: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_cvt_i32_i16_e64 v5, v1                ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_cvt_i32_i16_e64 v5, v1.l              ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_cvt_i32_i16_e64 v5, v1                ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_cvt_i32_i16_e64 v5, v255              ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_cvt_i32_i16_e64 v5, v255.l            ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_cvt_i32_i16_e64 v5, v255              ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00
 # GFX11: v_cvt_i32_i16_e64 v5, s1                ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00]
@@ -1358,6 +1387,10 @@
 0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
 # GFX11: v_cvt_i32_i16_e64 v255, 0xfe0b          ; encoding: [0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
+0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00
+# GFX11-REAL16: v_cvt_i32_i16_e64 v5, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_cvt_i32_i16_e64 v5, v255              ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
+
 0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00
 # GFX11: v_cvt_nearest_i32_f32_e64 v5, v1        ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00]
 
@@ -1746,10 +1779,12 @@
 # GFX11: v_cvt_u32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x95,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_cvt_u32_u16_e64 v5, v1                ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_cvt_u32_u16_e64 v5, v1.l              ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_cvt_u32_u16_e64 v5, v1                ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_cvt_u32_u16_e64 v5, v255              ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_cvt_u32_u16_e64 v5, v255.l            ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_cvt_u32_u16_e64 v5, v255              ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xeb,0xd5,0x01,0x00,0x00,0x00
 # GFX11: v_cvt_u32_u16_e64 v5, s1                ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x00,0x00,0x00]
@@ -1790,6 +1825,10 @@
 0xff,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
 # GFX11: v_cvt_u32_u16_e64 v255, 0xfe0b          ; encoding: [0xff,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
+0x05,0x08,0xeb,0xd5,0xff,0x01,0x00,0x00
+# GFX11-REAL16: v_cvt_u32_u16_e64 v5, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xeb,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_cvt_u32_u16_e64 v5, v255              ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
+
 0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00
 # GFX11-REAL16: v_exp_f16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00]
 # GFX11-FAKE16: v_exp_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00]
@@ -2037,49 +2076,76 @@
 # GFX11: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08
-# GFX11: v_fract_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-FAKE16: v_fract_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10
-# GFX11: v_fract_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-FAKE16: v_fract_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX11: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00
+# GFX11-REAL16: v_fract_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00
+# GFX11-REAL16: v_fract_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_fract_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00
 # GFX11: v_fract_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00]
@@ -2316,49 +2382,76 @@
 # GFX11: v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, v1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, v1.l         ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, v1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, v255           ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, v255.l       ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, v255           ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, s1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, s1           ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, s1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, s105           ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, s105         ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, s105           ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, vcc_lo         ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, vcc_lo       ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, vcc_lo         ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, vcc_hi         ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, vcc_hi       ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, vcc_hi         ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, ttmp15         ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, ttmp15       ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, ttmp15         ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, m0             ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, m0           ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, m0             ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, exec_lo        ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, exec_lo      ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, exec_lo        ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, exec_hi        ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, exec_hi      ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, exec_hi        ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, null           ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, null         ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, null           ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, -1             ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, -1           ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, -1             ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08
-# GFX11: v_frexp_mant_f16_e64 v5, 0.5 mul:2      ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, 0.5 mul:2    ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, 0.5 mul:2      ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10
-# GFX11: v_frexp_mant_f16_e64 v5, src_scc mul:4  ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, src_scc mul:4  ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, v1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, v255           ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_frexp_mant_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00
 # GFX11: v_frexp_mant_f32_e64 v5, v1             ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00]
@@ -2658,49 +2751,76 @@
 # GFX11: v_nop                                   ; encoding: [0x00,0x00,0x80,0xd5,0x00,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_not_b16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_not_b16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, null                  ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, null                  ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, 0x3800                ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, 0x3800              ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, 0x3800                ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, src_scc               ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, src_scc             ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, src_scc               ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
 
 0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
-# GFX11: v_not_b16_e64 v255, 0xfe0b              ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v255.l, 0xfe0b            ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v255, 0xfe0b              ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00
+# GFX11-REAL16: v_not_b16_e64 v5.h, v1.h op_sel:[1,1]   ; encoding: [0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00
+# GFX11-REAL16: v_not_b16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_not_b16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v255, 0xfe0b              ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00
 # GFX11: v_not_b32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00]
@@ -2937,49 +3057,76 @@
 # GFX11: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08
-# GFX11: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10
-# GFX11: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX11: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00
 # GFX11: v_rndne_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00]
@@ -3204,94 +3351,140 @@
 # GFX11: v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb1,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, v1              ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, v1            ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, v1              ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, v255            ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, v255          ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, v255            ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, s1              ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, s1            ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, s1              ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, s105            ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, s105          ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, s105            ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, vcc_lo          ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, vcc_lo        ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, vcc_lo          ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, vcc_hi          ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, vcc_hi        ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, vcc_hi          ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, ttmp15          ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, ttmp15        ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, ttmp15          ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, m0              ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, m0            ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, m0              ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, exec_lo         ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, exec_lo       ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, exec_lo         ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, exec_hi         ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, exec_hi       ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, exec_hi         ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, null            ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, null          ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, null            ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, -1              ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, -1            ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, -1              ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, 0.5             ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, 0.5           ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, 0.5             ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, src_scc         ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, src_scc       ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, src_scc         ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00]
 
 0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v255, 0xfe0b        ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v255.l, 0xfe0b      ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b        ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b        ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08
-# GFX11: v_sin_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, 0.5 mul:2           ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-FAKE16: v_sin_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10
-# GFX11: v_sin_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, src_scc mul:4       ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-FAKE16: v_sin_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX11: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00
+# GFX11-REAL16: v_sin_f16_e64 v5.h, v1.h op_sel:[1,1]   ; encoding: [0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00
+# GFX11-REAL16: v_sin_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_sin_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00
 # GFX11: v_sin_f32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00]
@@ -3480,49 +3673,76 @@
 # GFX11: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08
-# GFX11: v_trunc_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10
-# GFX11: v_trunc_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX11: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00
+# GFX11-REAL16: v_trunc_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_trunc_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00
 # GFX11: v_trunc_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt
index 28decdd..02641e6 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt
@@ -1277,3 +1277,21 @@
 
 # GFX12: s_buffer_load_u16 s5, s[96:99], s0 offset:0x0 th:TH_LOAD_HT scope:SCOPE_SYS ; encoding:  [0x70,0x61,0x63,0xf5,0x00,0x00,0x00,0x00]
 0x70,0x61,0x63,0xf5,0x00,0x00,0x00,0x00
+
+# GFX12: s_load_b32 null, s[2:3], s0 offset:0x0  ; encoding: [0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00]
+0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00
+
+# GFX12: s_load_b64 null, s[2:3], s0 offset:0x0  ; encoding: [0x01,0x3f,0x00,0xf4,0x00,0x00,0x00,0x00]
+0x01,0x3f,0x00,0xf4,0x00,0x00,0x00,0x00
+
+# GFX12: s_load_b96 null, s[2:3], s0 offset:0x0  ; encoding: [0x01,0xbf,0x00,0xf4,0x00,0x00,0x00,0x00]
+0x01,0xbf,0x00,0xf4,0x00,0x00,0x00,0x00
+
+# GFX12: s_load_b128 null, s[2:3], s0 offset:0x0  ; encoding: [0x01,0x5f,0x00,0xf4,0x00,0x00,0x00,0x00]
+0x01,0x5f,0x00,0xf4,0x00,0x00,0x00,0x00
+
+# GFX12: s_load_b256 null, s[2:3], s0 offset:0x0  ; encoding: [0x01,0x7f,0x00,0xf4,0x00,0x00,0x00,0x00]
+0x01,0x7f,0x00,0xf4,0x00,0x00,0x00,0x00
+
+# GFX12: s_load_b512 null, s[2:3], s0 offset:0x0  ; encoding: [0x01,0x9f,0x00,0xf4,0x00,0x00,0x00,0x00]
+0x01,0x9f,0x00,0xf4,0x00,0x00,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt
index a1291b2..181b78f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt
@@ -237,46 +237,68 @@
 # GFX12: v_clz_i32_u32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x72,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX12: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-REAL16: v_cos_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_cos_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX12-REAL16: v_cos_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX12: v_cos_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1035,46 +1057,68 @@
 # GFX12: v_cvt_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x10,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30
-# GFX12: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+
+0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13]
+
+0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_cvt_i32_i16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
 
 0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX12: v_cvt_nearest_i32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1395,46 +1439,68 @@
 # GFX12: v_cvt_u32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x0e,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x0d,0x30
-# GFX12: v_cvt_u32_u16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+
+0xfa,0xd6,0x0a,0x7e,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x81,0x60,0x01,0x13]
+
+0xfa,0xd6,0xfe,0x7f,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_cvt_u32_u16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
 
 0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1649,46 +1715,68 @@
 # GFX12: v_floor_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x48,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX12: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-REAL16: v_fract_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_fract_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX12-REAL16: v_fract_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX12: v_fract_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1839,46 +1927,68 @@
 # GFX12: v_frexp_exp_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x7e,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX12: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX12-REAL16: v_frexp_mant_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX12: v_frexp_mant_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2239,46 +2349,68 @@
 # GFX12: v_movrelsd_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x88,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30
-# GFX12: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_not_b16_dpp v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30]
+
+0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_not_b16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_not_b16_dpp v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_lshlrev_b32_e32 v6, v255, v183        ; encoding: [0xff,0x6f,0x0d,0x30]
 
 0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX12: v_not_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2471,46 +2603,68 @@
 # GFX12: v_rcp_iflag_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x56,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX12: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-REAL16: v_rndne_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_rndne_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX12-REAL16: v_rndne_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX12: v_rndne_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2661,88 +2815,132 @@
 # GFX12: v_rsq_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x5c,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30
-# GFX12: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v127.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30]
+
+0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x01,0x13
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_mul_i32_i24_e32 v128, s1, v176        ; encoding: [0x01,0x60,0x01,0x13]
+
+0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_lshlrev_b32_e32 v6, v255, v183        ; encoding: [0xff,0x6f,0x0d,0x30]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX12: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-REAL16: v_sin_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_sin_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX12-REAL16: v_sin_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX12: v_sin_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2893,46 +3091,68 @@
 # GFX12: v_sqrt_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x66,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX12: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-REAL16: v_trunc_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xba,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_trunc_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX12-REAL16: v_trunc_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX12: v_trunc_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt
index 05008bf..7f9b268 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt
@@ -44,10 +44,19 @@
 # GFX12: v_clz_i32_u32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x72,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX12: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_cos_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_cos_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12: v_cos_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -198,10 +207,20 @@
 # GFX12: v_cvt_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x10,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00
-# GFX12: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+
+0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05]
+
+0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_cvt_i32_i16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12: v_cvt_nearest_i32_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -267,10 +286,20 @@
 # GFX12: v_cvt_u32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x0e,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_cvt_u32_u16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00
-# GFX12: v_cvt_u32_u16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+
+0xe9,0xd6,0x0a,0x7e,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x81,0x77,0x39,0x05]
+
+0xea,0xd6,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_cvt_u32_u16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -315,10 +344,19 @@
 # GFX12: v_floor_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x48,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX12: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_fract_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_fract_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12: v_fract_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -348,10 +386,19 @@
 # GFX12: v_frexp_exp_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x7e,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_frexp_mant_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12: v_frexp_mant_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -411,10 +458,20 @@
 # GFX12: v_movrelsd_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x88,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX12: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_not_b16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_not_b16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
 
 0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12: v_not_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -450,10 +507,19 @@
 # GFX12: v_rcp_iflag_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x56,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX12: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_rndne_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_rndne_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12: v_rndne_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -483,16 +549,34 @@
 # GFX12: v_rsq_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x5c,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v127.l, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00]
+
+0xe9,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[1:2], v[187:188] ; encoding: [0x01,0x77,0x39,0x05]
+
+0xea,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX12: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_sin_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_sin_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12: v_sin_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -522,10 +606,19 @@
 # GFX12: v_sqrt_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x66,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX12: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_trunc_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_trunc_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12: v_trunc_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
index 4c2060a..633d3a4 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
@@ -2377,49 +2377,118 @@
 # GFX12: v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x09,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00
-# GFX12: v_fma_f16 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+# W32-REAL16: v_fma_f16 v5.l, v1.l, v2.l, s3          ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+# W32-FAKE16: v_fma_f16 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+# W64-REAL16: v_fma_f16 v5.l, v1.l, v2.l, s3          ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+# W64-FAKE16: v_fma_f16 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
 
 0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01
-# GFX12: v_fma_f16 v5, v255, s2, s105            ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W32-REAL16: v_fma_f16 v5.l, v255.l, s2, s105        ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_fma_f16 v5, v255, s2, s105            ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_fma_f16 v5.l, v255.l, s2, s105        ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_fma_f16 v5, v255, s2, s105            ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
 
 0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01
-# GFX12: v_fma_f16 v5, s1, v255, exec_hi         ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W32-REAL16: v_fma_f16 v5.l, s1, v255.l, exec_hi     ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_fma_f16 v5, s1, v255, exec_hi         ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_fma_f16 v5.l, s1, v255.l, exec_hi     ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_fma_f16 v5, s1, v255, exec_hi         ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
 
 0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01
-# GFX12: v_fma_f16 v5, s105, s105, exec_lo       ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-REAL16: v_fma_f16 v5.l, s105, s105, exec_lo     ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-FAKE16: v_fma_f16 v5, s105, s105, exec_lo       ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-REAL16: v_fma_f16 v5.l, s105, s105, exec_lo     ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-FAKE16: v_fma_f16 v5, s105, s105, exec_lo       ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
 
 0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04
-# GFX12: v_fma_f16 v5, vcc_lo, ttmp15, v3        ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-REAL16: v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l    ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-FAKE16: v_fma_f16 v5, vcc_lo, ttmp15, v3        ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-REAL16: v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l    ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-FAKE16: v_fma_f16 v5, vcc_lo, ttmp15, v3        ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
 
 0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
-# GFX12: v_fma_f16 v5, vcc_hi, 0xfe0b, v255      ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l  ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v5, vcc_hi, 0xfe0b, v255      ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l  ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v5, vcc_hi, 0xfe0b, v255      ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1
-# GFX12: v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-REAL16: v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-FAKE16: v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-REAL16: v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-FAKE16: v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
 
 0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01
-# GFX12: v_fma_f16 v5, m0, 0.5, m0               ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-REAL16: v_fma_f16 v5.l, m0, 0.5, m0             ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-FAKE16: v_fma_f16 v5, m0, 0.5, m0               ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-REAL16: v_fma_f16 v5.l, m0, 0.5, m0             ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-FAKE16: v_fma_f16 v5, m0, 0.5, m0               ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
 
 0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01
-# GFX12: v_fma_f16 v5, |exec_lo|, -1, vcc_hi     ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+# W32-REAL16: v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi   ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+# W32-FAKE16: v_fma_f16 v5, |exec_lo|, -1, vcc_hi     ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+# W64-REAL16: v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi   ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+# W64-FAKE16: v_fma_f16 v5, |exec_lo|, -1, vcc_hi     ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
 
 0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1
-# GFX12: v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-REAL16: v_fma_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-FAKE16: v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-REAL16: v_fma_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-FAKE16: v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
 
 0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00
-# GFX12: v_fma_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_fma_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
 0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3
-# GFX12: v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-REAL16: v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-FAKE16: v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-REAL16: v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-FAKE16: v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
 
 0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43
-# GFX12: v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+# W32-REAL16: v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+# W32-FAKE16: v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+# W64-REAL16: v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+# W64-FAKE16: v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
 
 0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23
-# GFX12: v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+# W32-REAL16: v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+# W32-FAKE16: v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+# W64-REAL16: v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+# W64-FAKE16: v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
 
 0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00
-# GFX12: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+
+0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01
+# W32-REAL16: v_fma_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_fma_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_fma_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_fma_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+
+0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01
+# W32-REAL16: v_fma_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_fma_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_fma_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_fma_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+
+0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x13,0xd6,0x01,0x05,0x0e,0x00
 # GFX12: v_fma_f32 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x13,0xd6,0x01,0x05,0x0e,0x00]
@@ -3509,49 +3578,100 @@
 # GFX12: v_mad_co_u64_u32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp ; encoding: [0xfe,0xfc,0xfe,0xd6,0xff,0xd6,0xc0,0x03,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00
-# GFX12: v_max3_num_f16 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00]
+# W32-REAL16: v_max3_num_f16 v5.l, v1.l, v2.l, s3     ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00]
+# W32-FAKE16: v_max3_num_f16 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00]
+# W64-REAL16: v_max3_num_f16 v5.l, v1.l, v2.l, s3     ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00]
+# W64-FAKE16: v_max3_num_f16 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00]
 
 0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01
-# GFX12: v_max3_num_f16 v5, v255, s2, s105       ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01]
+# W32-REAL16: v_max3_num_f16 v5.l, v255.l, s2, s105   ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_max3_num_f16 v5, v255, s2, s105       ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_max3_num_f16 v5.l, v255.l, s2, s105   ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_max3_num_f16 v5, v255, s2, s105       ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01]
 
 0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01
-# GFX12: v_max3_num_f16 v5, s1, v255, exec_hi    ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01]
+# W32-REAL16: v_max3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_max3_num_f16 v5, s1, v255, exec_hi    ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_max3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_max3_num_f16 v5, s1, v255, exec_hi    ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01]
 
 0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01
-# GFX12: v_max3_num_f16 v5, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-REAL16: v_max3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-FAKE16: v_max3_num_f16 v5, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-REAL16: v_max3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-FAKE16: v_max3_num_f16 v5, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01]
 
 0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04
-# GFX12: v_max3_num_f16 v5, vcc_lo, ttmp15, v3   ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-REAL16: v_max3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-FAKE16: v_max3_num_f16 v5, vcc_lo, ttmp15, v3   ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-REAL16: v_max3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-FAKE16: v_max3_num_f16 v5, vcc_lo, ttmp15, v3   ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04]
 
 0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
-# GFX12: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1
-# GFX12: v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-REAL16: v_max3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-FAKE16: v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-REAL16: v_max3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-FAKE16: v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1]
 
 0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01
-# GFX12: v_max3_num_f16 v5, m0, 0.5, m0          ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-REAL16: v_max3_num_f16 v5.l, m0, 0.5, m0        ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-FAKE16: v_max3_num_f16 v5, m0, 0.5, m0          ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-REAL16: v_max3_num_f16 v5.l, m0, 0.5, m0        ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-FAKE16: v_max3_num_f16 v5, m0, 0.5, m0          ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01]
 
 0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01
-# GFX12: v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01]
+# W32-REAL16: v_max3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01]
+# W32-FAKE16: v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01]
+# W64-REAL16: v_max3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01]
+# W64-FAKE16: v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01]
 
 0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1
-# GFX12: v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-REAL16: v_max3_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-FAKE16: v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-REAL16: v_max3_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-FAKE16: v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1]
 
 0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00
-# GFX12: v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_max3_num_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_max3_num_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
 0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3
-# GFX12: v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-REAL16: v_max3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-FAKE16: v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-REAL16: v_max3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-FAKE16: v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3]
 
 0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43
-# GFX12: v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43]
+# W32-REAL16: v_max3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43]
+# W32-FAKE16: v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43]
+# W64-REAL16: v_max3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43]
+# W64-FAKE16: v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43]
 
 0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23
-# GFX12: v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23]
+# W32-REAL16: v_max3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23]
+# W32-FAKE16: v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23]
+# W64-REAL16: v_max3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23]
+# W64-FAKE16: v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23]
 
 0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00
-# GFX12: v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+
+0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x2a,0xd6,0x01,0x05,0x0e,0x00
 # GFX12: v_max3_num_f32 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x2a,0xd6,0x01,0x05,0x0e,0x00]
@@ -4145,49 +4265,120 @@
 # W64-FAKE16: v_max_u16 v255, 0xfe0b, vcc_hi          ; encoding: [0xff,0x00,0x09,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00
-# GFX12: v_maxmin_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
+# W32-REAL16: v_maxmin_num_f16 v5.l, v1.l, v2.l, s3   ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
+# W32-FAKE16: v_maxmin_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
+# W64-REAL16: v_maxmin_num_f16 v5.l, v1.l, v2.l, s3   ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
+# W64-FAKE16: v_maxmin_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
 
 0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01
-# GFX12: v_maxmin_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W32-REAL16: v_maxmin_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
 
 0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01
-# GFX12: v_maxmin_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W32-REAL16: v_maxmin_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
 
 0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01
-# GFX12: v_maxmin_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-REAL16: v_maxmin_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
 
 0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04
-# GFX12: v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-REAL16: v_maxmin_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-FAKE16: v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-REAL16: v_maxmin_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-FAKE16: v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
 
 0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
-# GFX12: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1
-# GFX12: v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-REAL16: v_maxmin_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-FAKE16: v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-REAL16: v_maxmin_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-FAKE16: v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
 
 0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01
-# GFX12: v_maxmin_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-REAL16: v_maxmin_num_f16 v5.l, m0, 0.5, m0      ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, m0, 0.5, m0      ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
 
 0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01
-# GFX12: v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
+# W32-REAL16: v_maxmin_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
 
 0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1
-# GFX12: v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-REAL16: v_maxmin_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-FAKE16: v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-REAL16: v_maxmin_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-FAKE16: v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
 
 0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00
-# GFX12: v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_maxmin_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
 0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3
-# GFX12: v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-REAL16: v_maxmin_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-FAKE16: v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-REAL16: v_maxmin_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-FAKE16: v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
 
 0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b
-# GFX12: v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-REAL16: v_maxmin_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-FAKE16: v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-REAL16: v_maxmin_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-FAKE16: v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
 
 0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33
-# GFX12: v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-REAL16: v_maxmin_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-FAKE16: v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-REAL16: v_maxmin_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-FAKE16: v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
 
 0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
-# GFX12: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_maxmin_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01
+# W32-REAL16: v_maxmin_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+
+
+0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01
+# W32-REAL16: v_maxmin_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+
+
+0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_maxmin_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x69,0xd6,0x01,0x05,0x0e,0x00
 # GFX12: v_maxmin_num_f32 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x69,0xd6,0x01,0x05,0x0e,0x00]
@@ -4886,49 +5077,100 @@
 # GFX12: v_med3_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x21,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00
-# GFX12: v_min3_num_f16 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00]
+# W32-REAL16: v_min3_num_f16 v5.l, v1.l, v2.l, s3     ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00]
+# W32-FAKE16: v_min3_num_f16 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00]
+# W64-REAL16: v_min3_num_f16 v5.l, v1.l, v2.l, s3     ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00]
+# W64-FAKE16: v_min3_num_f16 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00]
 
 0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01
-# GFX12: v_min3_num_f16 v5, v255, s2, s105       ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01]
+# W32-REAL16: v_min3_num_f16 v5.l, v255.l, s2, s105   ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_min3_num_f16 v5, v255, s2, s105       ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_min3_num_f16 v5.l, v255.l, s2, s105   ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_min3_num_f16 v5, v255, s2, s105       ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01]
 
 0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01
-# GFX12: v_min3_num_f16 v5, s1, v255, exec_hi    ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01]
+# W32-REAL16: v_min3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_min3_num_f16 v5, s1, v255, exec_hi    ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_min3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_min3_num_f16 v5, s1, v255, exec_hi    ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01]
 
 0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01
-# GFX12: v_min3_num_f16 v5, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-REAL16: v_min3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-FAKE16: v_min3_num_f16 v5, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-REAL16: v_min3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-FAKE16: v_min3_num_f16 v5, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01]
 
 0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04
-# GFX12: v_min3_num_f16 v5, vcc_lo, ttmp15, v3   ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-REAL16: v_min3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-FAKE16: v_min3_num_f16 v5, vcc_lo, ttmp15, v3   ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-REAL16: v_min3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-FAKE16: v_min3_num_f16 v5, vcc_lo, ttmp15, v3   ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04]
 
 0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
-# GFX12: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1
-# GFX12: v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-REAL16: v_min3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-FAKE16: v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-REAL16: v_min3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-FAKE16: v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1]
 
 0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01
-# GFX12: v_min3_num_f16 v5, m0, 0.5, m0          ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-REAL16: v_min3_num_f16 v5.l, m0, 0.5, m0        ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-FAKE16: v_min3_num_f16 v5, m0, 0.5, m0          ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-REAL16: v_min3_num_f16 v5.l, m0, 0.5, m0        ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-FAKE16: v_min3_num_f16 v5, m0, 0.5, m0          ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01]
 
 0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01
-# GFX12: v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01]
+# W32-REAL16: v_min3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01]
+# W32-FAKE16: v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01]
+# W64-REAL16: v_min3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01]
+# W64-FAKE16: v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01]
 
 0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1
-# GFX12: v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-REAL16: v_min3_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-FAKE16: v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-REAL16: v_min3_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-FAKE16: v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1]
 
 0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00
-# GFX12: v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_min3_num_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_min3_num_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
 0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3
-# GFX12: v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-REAL16: v_min3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-FAKE16: v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-REAL16: v_min3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-FAKE16: v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3]
 
 0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43
-# GFX12: v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43]
+# W32-REAL16: v_min3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43]
+# W32-FAKE16: v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43]
+# W64-REAL16: v_min3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43]
+# W64-FAKE16: v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43]
 
 0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23
-# GFX12: v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23]
+# W32-REAL16: v_min3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23]
+# W32-FAKE16: v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23]
+# W64-REAL16: v_min3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23]
+# W64-FAKE16: v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23]
 
 0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00
-# GFX12: v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+
+0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x29,0xd6,0x01,0x05,0x0e,0x00
 # GFX12: v_min3_num_f32 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x29,0xd6,0x01,0x05,0x0e,0x00]
@@ -5522,49 +5764,120 @@
 # W64-FAKE16: v_min_u16 v255, 0xfe0b, vcc_hi          ; encoding: [0xff,0x00,0x0b,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00
-# GFX12: v_minmax_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
+# W32-REAL16: v_minmax_num_f16 v5.l, v1.l, v2.l, s3   ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
+# W32-FAKE16: v_minmax_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
+# W64-REAL16: v_minmax_num_f16 v5.l, v1.l, v2.l, s3   ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
+# W64-FAKE16: v_minmax_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
 
 0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01
-# GFX12: v_minmax_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W32-REAL16: v_minmax_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
 
 0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01
-# GFX12: v_minmax_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W32-REAL16: v_minmax_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
 
 0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01
-# GFX12: v_minmax_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-REAL16: v_minmax_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
 
 0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04
-# GFX12: v_minmax_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-REAL16: v_minmax_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-FAKE16: v_minmax_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-REAL16: v_minmax_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-FAKE16: v_minmax_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
 
 0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
-# GFX12: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1
-# GFX12: v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-REAL16: v_minmax_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-FAKE16: v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-REAL16: v_minmax_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-FAKE16: v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
 
 0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01
-# GFX12: v_minmax_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-REAL16: v_minmax_num_f16 v5.l, m0, 0.5, m0      ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, m0, 0.5, m0      ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
 
 0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01
-# GFX12: v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
+# W32-REAL16: v_minmax_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
 
 0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1
-# GFX12: v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-REAL16: v_minmax_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-FAKE16: v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-REAL16: v_minmax_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-FAKE16: v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
 
 0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00
-# GFX12: v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_minmax_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
 0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3
-# GFX12: v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-REAL16: v_minmax_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-FAKE16: v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-REAL16: v_minmax_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-FAKE16: v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
 
 0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b
-# GFX12: v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-REAL16: v_minmax_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-FAKE16: v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-REAL16: v_minmax_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-FAKE16: v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
 
 0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33
-# GFX12: v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-REAL16: v_minmax_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-FAKE16: v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-REAL16: v_minmax_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-FAKE16: v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
 
 0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
-# GFX12: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_minmax_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01
+# W32-REAL16: v_minmax_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+
+
+0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01
+# W32-REAL16: v_minmax_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+
+
+0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_minmax_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x68,0xd6,0x01,0x05,0x0e,0x00
 # GFX12: v_minmax_num_f32 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x68,0xd6,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
index c64fe39..7e30a4a 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
@@ -2329,52 +2329,131 @@
 # W64-FAKE16: v_max_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x09,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
 0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
 0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
 0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
 0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
 0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01
-# GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
 
 0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
 
 0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
-# GFX12: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+
+0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+
+0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+
+0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -3082,49 +3161,125 @@
 # W64-FAKE16: v_min_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x0b,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
 0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
 0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
 0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
 0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
 0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01
-# GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
 
 0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
 
 0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
-# GFX12: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+
+0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+
+0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+
+0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
+# W32-REAL16: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -4764,49 +4919,124 @@
 # W64-FAKE16: v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x54,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_fma_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
 0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff
-# GFX12: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
 
 0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff
-# GFX12: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
 0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff
-# GFX12: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
 0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff
-# GFX12: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
 0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01
-# GFX12: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
 0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13
-# GFX12: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
 
 0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30
-# GFX12: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+
+0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+
+0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+
+0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30
+# W32-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x53,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # W32-REAL16: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -5199,49 +5429,119 @@
 # GFX12: v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x90,0x59,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
 0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
 
 0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
 0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
 0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
 0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01
-# GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
 0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
 
 0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30
-# GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+
+0x05,0x78,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+0x05,0x20,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x0a,0x2c,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2c,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2c,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2c,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2c,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+
+0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+
 
 0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # W32-REAL16: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -5964,49 +6264,119 @@
 # W64-FAKE16: v_med3_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0x51,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
 0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
 
 0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
 0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
 0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
 0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01
-# GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
 0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
 
 0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30
-# GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+
+0x05,0x78,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+0x05,0x20,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x0a,0x2b,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2b,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2b,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2b,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2b,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+
+0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+
 
 0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # W32-REAL16: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
index 9ed20c7..2aaba2a 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
@@ -1294,43 +1294,112 @@
 # W64-FAKE16: v_max_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x09,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
 0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
 0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
 0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
 
 0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
 
 0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
-# GFX12: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+
+0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+
+0x05,0x13,0x6b,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6b,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6b,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6b,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6b,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+0xff,0xc7,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -1768,43 +1837,112 @@
 # W64-FAKE16: v_min_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x0b,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
 0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
 0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
 0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
 
 0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
 
 0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
-# GFX12: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+
+0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+
+0x05,0x13,0x6a,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6a,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6a,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6a,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6a,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+0xff,0xc7,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
+# W32-REAL16: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x68,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -2814,52 +2952,130 @@
 # W64-FAKE16: v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x54,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0xec,0x0d,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, 4.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0xec,0x0d,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, 4.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0xec,0x0d,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, 4.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0xec,0x0d,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, 4.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0xec,0x0d,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, 4.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0xec,0x0d,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
 0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
 0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
 0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
 0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
 0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00
-# GFX12: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+
+0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+
+0x05,0x13,0x48,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x48,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x48,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x48,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x48,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+
+0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00
+# W32-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x53,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # W32-REAL16: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -3240,49 +3456,119 @@
 # GFX12: v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x90,0x59,0xd6,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
 0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
 0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
 0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
 0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
 0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00
-# GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+
+0x05,0x78,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x0a,0x2c,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2c,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2c,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2c,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2c,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+
+0x05,0x13,0x2c,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2c,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2c,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2c,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2c,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+
 
 0x05,0x00,0x4d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # W32-REAL16: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -3981,49 +4267,119 @@
 # W64-FAKE16: v_med3_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0x51,0xd6,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
 0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
 0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
 0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
 0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
 0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00
-# GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+
+0x05,0x78,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x0a,0x2b,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2b,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2b,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2b,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2b,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+
+0x05,0x13,0x2b,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2b,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2b,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2b,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2b,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+
 
 0x05,0x00,0x4a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # W32-REAL16: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt
index bb9f607..25c4e4a 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt
@@ -279,49 +279,76 @@
 # GFX12: v_clz_i32_u32_e64 v255, 0xaf123456      ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08
-# GFX12: v_cos_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, 0.5 mul:2           ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-FAKE16: v_cos_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10
-# GFX12: v_cos_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, src_scc mul:4       ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-FAKE16: v_cos_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX12: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00
+# GFX12-REAL16: v_cos_f16_e64 v5.h, v1.h op_sel:[1,1]   ; encoding: [0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00
+# GFX12-REAL16: v_cos_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_cos_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00
 # GFX12: v_cos_f32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00]
@@ -1366,10 +1393,12 @@
 # GFX12: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_cvt_i32_i16_e64 v5, v1                ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_cvt_i32_i16_e64 v5, v1.l              ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cvt_i32_i16_e64 v5, v1                ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_cvt_i32_i16_e64 v5, v255              ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_cvt_i32_i16_e64 v5, v255.l            ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cvt_i32_i16_e64 v5, v255              ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00
 # GFX12: v_cvt_i32_i16_e64 v5, s1                ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00]
@@ -1410,6 +1439,10 @@
 0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
 # GFX12: v_cvt_i32_i16_e64 v255, 0xfe0b          ; encoding: [0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
+0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00
+# GFX12-REAL16: v_cvt_i32_i16_e64 v5, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cvt_i32_i16_e64 v5, v255              ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
+
 0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00
 # GFX12: v_cvt_nearest_i32_f32_e64 v5, v1        ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00]
 
@@ -1798,10 +1831,12 @@
 # GFX12: v_cvt_u32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x95,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_cvt_u32_u16_e64 v5, v1                ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_cvt_u32_u16_e64 v5, v1.l              ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cvt_u32_u16_e64 v5, v1                ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_cvt_u32_u16_e64 v5, v255              ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_cvt_u32_u16_e64 v5, v255.l            ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cvt_u32_u16_e64 v5, v255              ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xeb,0xd5,0x01,0x00,0x00,0x00
 # GFX12: v_cvt_u32_u16_e64 v5, s1                ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x00,0x00,0x00]
@@ -1842,6 +1877,10 @@
 0xff,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
 # GFX12: v_cvt_u32_u16_e64 v255, 0xfe0b          ; encoding: [0xff,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
+0x05,0x08,0xeb,0xd5,0xff,0x01,0x00,0x00
+# GFX12-REAL16: v_cvt_u32_u16_e64 v5, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xeb,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cvt_u32_u16_e64 v5, v255              ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
+
 0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00
 # GFX12-REAL16: v_exp_f16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00]
 # GFX12-FAKE16: v_exp_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00]
@@ -2089,50 +2128,78 @@
 # GFX12: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08
-# GFX12: v_fract_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-FAKE16: v_fract_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10
-# GFX12: v_fract_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-FAKE16: v_fract_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX12: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00
+# GFX12-REAL16: v_fract_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
 
+0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00
+# GFX12-REAL16: v_fract_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_fract_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+# GFX11: v_fract_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00]
 0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00
 # GFX12: v_fract_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00]
 
@@ -2368,50 +2435,76 @@
 # GFX12: v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, v1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, v1.l         ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, v1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, v255           ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, v255.l       ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, v255           ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, s1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, s1           ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, s1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, s105           ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, s105         ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, s105           ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, vcc_lo         ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, vcc_lo       ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, vcc_lo         ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, vcc_hi         ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, vcc_hi       ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, vcc_hi         ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, ttmp15         ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, ttmp15       ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, ttmp15         ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, m0             ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, m0           ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, m0             ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, exec_lo        ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, exec_lo      ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, exec_lo        ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, exec_hi        ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, exec_hi      ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, exec_hi        ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, null           ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, null         ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, null           ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, -1             ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, -1           ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, -1             ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08
-# GFX12: v_frexp_mant_f16_e64 v5, 0.5 mul:2      ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, 0.5 mul:2    ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, 0.5 mul:2      ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10
-# GFX12: v_frexp_mant_f16_e64 v5, src_scc mul:4  ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, src_scc mul:4  ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
-
+# GFX12-REAL16: v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, v1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, v255           ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
+0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_frexp_mant_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+# GFX11: v_frexp_mant_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00]
 0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00
 # GFX12: v_frexp_mant_f32_e64 v5, v1             ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00]
 
@@ -2707,49 +2800,76 @@
 # GFX12: v_movrelsd_b32_e64 v255, v255           ; encoding: [0xff,0x00,0xc4,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_not_b16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_not_b16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, null                  ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, null                  ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, 0x3800                ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, 0x3800              ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, 0x3800                ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, src_scc               ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, src_scc             ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, src_scc               ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
 
 0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
-# GFX12: v_not_b16_e64 v255, 0xfe0b              ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v255.l, 0xfe0b            ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v255, 0xfe0b              ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00
+# GFX12-REAL16: v_not_b16_e64 v5.h, v1.h op_sel:[1,1]   ; encoding: [0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00
+# GFX12-REAL16: v_not_b16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_not_b16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v255, 0xfe0b              ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00
 # GFX12: v_not_b32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00]
@@ -2983,49 +3103,76 @@
 # GFX12: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08
-# GFX12: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10
-# GFX12: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX12: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00
+# GFX12-REAL16: v_rndne_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_rndne_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00
 # GFX12: v_rndne_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00]
@@ -3250,94 +3397,140 @@
 # GFX12: v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb1,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, v1              ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, v1            ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, v1              ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, v255            ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, v255          ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, v255            ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, s1              ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, s1            ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, s1              ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, s105            ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, s105          ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, s105            ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, vcc_lo          ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, vcc_lo        ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, vcc_lo          ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, vcc_hi          ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, vcc_hi        ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, vcc_hi          ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, ttmp15          ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, ttmp15        ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, ttmp15          ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, m0              ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, m0            ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, m0              ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, exec_lo         ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, exec_lo       ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, exec_lo         ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, exec_hi         ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, exec_hi       ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, exec_hi         ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, null            ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, null          ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, null            ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, -1              ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, -1            ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, -1              ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, 0.5             ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, 0.5           ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, 0.5             ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, src_scc         ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, src_scc       ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, src_scc         ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00]
 
 0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v255, 0xfe0b        ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v255.l, 0xfe0b      ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b        ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b        ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08
-# GFX12: v_sin_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, 0.5 mul:2           ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-FAKE16: v_sin_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10
-# GFX12: v_sin_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, src_scc mul:4       ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-FAKE16: v_sin_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX12: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00
+# GFX12-REAL16: v_sin_f16_e64 v5.h, v1.h op_sel:[1,1]   ; encoding: [0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00
+# GFX12-REAL16: v_sin_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_sin_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00
 # GFX12: v_sin_f32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00]
@@ -3526,49 +3719,76 @@
 # GFX12: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08
-# GFX12: v_trunc_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10
-# GFX12: v_trunc_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX12: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00
+# GFX12-REAL16: v_trunc_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_trunc_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00
 # GFX12: v_trunc_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
index be9f069..f447fb4 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
@@ -227,46 +227,72 @@
 # GFX12: v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX12: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX12: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX12: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1055,46 +1081,64 @@
 # GFX12: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
 
 0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
-# GFX12: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+
+0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1427,46 +1471,64 @@
 # GFX12: v_cvt_u32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0x87,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
 
 0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
-# GFX12: v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+
+0xff,0x08,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v255, v255.h op_sel:[1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x08,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1665,47 +1727,74 @@
 # GFX12: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX12: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX12: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX12: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
+0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+# GFX11: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
@@ -1859,47 +1948,72 @@
 # GFX12: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX12: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
-
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+# GFX11: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
@@ -2125,46 +2239,72 @@
 # GFX12: v_movrels_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xc3,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01
-# GFX12: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13
-# GFX12: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
 
 0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
-# GFX12: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01
+# GFX12-REAL16: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+
+0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2349,46 +2489,72 @@
 # GFX12: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX12: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX12: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX12: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2531,88 +2697,132 @@
 # GFX12: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xae,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
 
 0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
-# GFX12: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v255.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+
+0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX12: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX12: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX12: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2755,46 +2965,72 @@
 # GFX12: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX12: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX12: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX12: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
index 87115b9..7cf415a 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
@@ -49,16 +49,32 @@
 # GFX12: v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX12: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX12: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12: v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -321,10 +337,16 @@
 # GFX12: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xea,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
-# GFX12: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+0xff,0x08,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x08,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12: v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -411,10 +433,16 @@
 # GFX12: v_cvt_u32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0x87,0xd5,0xea,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
-# GFX12: v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+0xff,0x08,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v255, v255.h op_sel:[1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x08,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -473,17 +501,34 @@
 # GFX12: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX12: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX12: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
+0x05,0x08,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+# GFX11: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12: v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
@@ -523,17 +568,32 @@
 # GFX12: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xea,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+0x05,0x08,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+0xff,0xc1,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+# GFX11: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12: v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
@@ -587,10 +647,24 @@
 # GFX12: v_movrels_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xc3,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
-# GFX12: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+0xff,0x40,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12: v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -639,16 +713,32 @@
 # GFX12: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX12: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX12: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12: v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -691,22 +781,44 @@
 # GFX12: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xae,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v255.l, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+0xff,0x40,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX12: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX12: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12: v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -749,16 +861,32 @@
 # GFX12: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX12: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX12: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12: v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/Xtensa/code_density.txt b/llvm/test/MC/Disassembler/Xtensa/code_density.txt
index eac236a..833dd52 100644
--- a/llvm/test/MC/Disassembler/Xtensa/code_density.txt
+++ b/llvm/test/MC/Disassembler/Xtensa/code_density.txt
@@ -1,4 +1,5 @@
-# RUN: llvm-mc -triple=xtensa -mattr=+density -disassemble %s | FileCheck --check-prefix=CHECK-DENSITY %s
+# RUN: llvm-mc -triple=xtensa -mattr=+density -disassemble %s | FileCheck -check-prefixes=CHECK-DENSITY %s
+# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s
 
 #------------------------------------------------------------------------------
 # Verify that binary code is correctly disassembled with
@@ -6,57 +7,46 @@
 # density option generates warnings.
 #------------------------------------------------------------------------------
 
-0x4a 0x23
+[0x4a, 0x23]
 # CHECK-DENSITY: add.n  a2, a3, a4
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
 
-0x3b 0x23
+[0x3b, 0x23]
 # CHECK-DENSITY: addi.n a2, a3, 3
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
 
-0x9c 0x03
+[0x9c, 0x03]
 # CHECK-DENSITY: beqz.n a3, . +20
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
 
-0xcc 0xe3
+[0xcc, 0xe3]
 # CHECK-DENSITY: bnez.n a3, . +18
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
 
-0x6d 0xf0
+[0x6d, 0xf0]
 # CHECK-DENSITY: ill.n
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
 
-0x28 0x33
+[0x28, 0x33]
 # CHECK-DENSITY: l32i.n a2, a3, 12
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
 
-0x2d 0x03
+[0x2d, 0x03]
 # CHECK-DENSITY: mov.n  a2, a3
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
 
-0x0d 0xf0
+[0x0d, 0xf0]
 # CHECK-DENSITY: ret.n
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
 
-0x29 0x33
+[0x29, 0x33]
 # CHECK-DENSITY: s32i.n a2, a3, 12
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
 
-0x6c 0x02
+[0x6c, 0x02]
 # CHECK-DENSITY: movi.n a2, -32
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
 
-0x3d 0xf0
+[0x3d, 0xf0]
 # CHECK-DENSITY: nop.n
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/RISCV/XVentanaCondOps-valid.s b/llvm/test/MC/RISCV/XVentanaCondOps-valid.s
index 8f4eba5..9825210 100644
--- a/llvm/test/MC/RISCV/XVentanaCondOps-valid.s
+++ b/llvm/test/MC/RISCV/XVentanaCondOps-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+xventanacondops -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+xventanacondops -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+xventanacondops < %s \
 # RUN:     | llvm-objdump --mattr=+xventanacondops -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/Zawrs-valid.s b/llvm/test/MC/RISCV/Zawrs-valid.s
index 0bdc570..29e95d3 100644
--- a/llvm/test/MC/RISCV/Zawrs-valid.s
+++ b/llvm/test/MC/RISCV/Zawrs-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zawrs -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zawrs -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zawrs -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zawrs -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zawrs < %s \
 # RUN:     | llvm-objdump --mattr=+zawrs -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/Ztso.s b/llvm/test/MC/RISCV/Ztso.s
index 06b1030..af61355 100644
--- a/llvm/test/MC/RISCV/Ztso.s
+++ b/llvm/test/MC/RISCV/Ztso.s
@@ -1,5 +1,5 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+ztso -riscv-no-aliases 2>&1 | FileCheck %s
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+ztso -riscv-no-aliases 2>&1 | FileCheck %s
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+ztso -M no-aliases 2>&1 | FileCheck %s
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+ztso -M no-aliases 2>&1 | FileCheck %s
 
 # Note: Ztso doesn't add or remove any instructions, so this is basically
 # just checking that a) we accepted the attribute name, and b) codegen did
diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s
index 6ffaa62..4e77a53 100644
--- a/llvm/test/MC/RISCV/attribute-arch.s
+++ b/llvm/test/MC/RISCV/attribute-arch.s
@@ -467,3 +467,9 @@
 
 .attribute arch, "rv32i_ssctr1p0"
 # CHECK: attribute      5, "rv32i2p1_sscsrind1p0_ssctr1p0"
+
+.attribute arch, "rv32i_sdext1p0"
+# CHECK: attribute      5, "rv32i2p1_sdext1p0"
+
+.attribute arch, "rv32i_sdtrig1p0"
+# CHECK: attribute      5, "rv32i2p1_sdtrig1p0"
diff --git a/llvm/test/MC/RISCV/compress-cjal.s b/llvm/test/MC/RISCV/compress-cjal.s
index d55586b..cdb6e85 100644
--- a/llvm/test/MC/RISCV/compress-cjal.s
+++ b/llvm/test/MC/RISCV/compress-cjal.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple riscv32 -mattr=+c -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv32 -mattr=+c -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -triple riscv32 -mattr=+c -filetype=obj < %s \
 # RUN:   | llvm-objdump  --triple=riscv32 --mattr=+c -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIASOBJ %s
diff --git a/llvm/test/MC/RISCV/compress-debug-info.s b/llvm/test/MC/RISCV/compress-debug-info.s
index 70aaefb..134c3d0 100644
--- a/llvm/test/MC/RISCV/compress-debug-info.s
+++ b/llvm/test/MC/RISCV/compress-debug-info.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc -triple riscv32 -mattr=+c %s -g -o - -riscv-no-aliases \
+# RUN: llvm-mc -triple riscv32 -mattr=+c %s -g -o - -M no-aliases \
 # RUN:   | FileCheck %s -check-prefixes=COMPRESS,BOTH
-# RUN: llvm-mc -triple riscv32 %s -g -o - -riscv-no-aliases \
+# RUN: llvm-mc -triple riscv32 %s -g -o - -M no-aliases \
 # RUN:   | FileCheck %s -check-prefixes=UNCOMPRESS,BOTH
 
 
diff --git a/llvm/test/MC/RISCV/compress-rv32d.s b/llvm/test/MC/RISCV/compress-rv32d.s
index c41a088..2bfae21 100644
--- a/llvm/test/MC/RISCV/compress-rv32d.s
+++ b/llvm/test/MC/RISCV/compress-rv32d.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple riscv32 -mattr=+c,+d -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv32 -mattr=+c,+d -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -triple riscv32 -mattr=+c,+d -filetype=obj < %s \
 # RUN:   | llvm-objdump --no-print-imm-hex --triple=riscv32 --mattr=+c,+d -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
@@ -11,7 +11,7 @@
 # RUN: llvm-mc -triple riscv32 -mattr=+zcd,+d -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv32 -mattr=+zcd,+d -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -triple riscv32 -mattr=+zcd,+d -filetype=obj < %s \
 # RUN:   | llvm-objdump --no-print-imm-hex --triple=riscv32 --mattr=+zcd,+d -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
@@ -22,7 +22,7 @@
 # RUN: llvm-mc -triple riscv64 -mattr=+c,+d -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv64 -mattr=+c,+d -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s
 # RUN: llvm-mc -triple riscv64 -mattr=+c,+d -filetype=obj < %s \
 # RUN:   | llvm-objdump --no-print-imm-hex --triple=riscv64 --mattr=+c,+d -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
@@ -32,7 +32,7 @@
 # RUN: llvm-mc -triple riscv64 -mattr=+zcd,+d -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv64 -mattr=+zcd,+d -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s
 # RUN: llvm-mc -triple riscv64 -mattr=+zcd,+d -filetype=obj < %s \
 # RUN:   | llvm-objdump --no-print-imm-hex --triple=riscv64 --mattr=+zcd,+d -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/compress-rv32f.s b/llvm/test/MC/RISCV/compress-rv32f.s
index afe15c5..5fc3f41 100644
--- a/llvm/test/MC/RISCV/compress-rv32f.s
+++ b/llvm/test/MC/RISCV/compress-rv32f.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple riscv32 -mattr=+c,+f -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv32 -mattr=+c,+f -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -triple riscv32 -mattr=+c,+f -filetype=obj < %s \
 # RUN:   | llvm-objdump  --triple=riscv32 --mattr=+c,+f --no-print-imm-hex -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
@@ -11,7 +11,7 @@
 # RUN: llvm-mc -triple riscv32 -mattr=+zcf,+f -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv32 -mattr=+zcf,+f -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -triple riscv32 -mattr=+zcf,+f -filetype=obj < %s \
 # RUN:   | llvm-objdump  --triple=riscv32 --mattr=+zcf,+f --no-print-imm-hex -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/compress-rv32i.s b/llvm/test/MC/RISCV/compress-rv32i.s
index a75bea3..5a28128 100644
--- a/llvm/test/MC/RISCV/compress-rv32i.s
+++ b/llvm/test/MC/RISCV/compress-rv32i.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple riscv32 -mattr=+c -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK,CHECK-ALIAS,CHECK-ALIASASM %s
 # RUN: llvm-mc -triple riscv32 -mattr=+c -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST,CHECK-INSTASM %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST,CHECK-INSTASM %s
 # RUN: llvm-mc -triple riscv32 -mattr=+c -filetype=obj < %s \
 # RUN:   | llvm-objdump  --triple=riscv32 --mattr=+c --no-print-imm-hex -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS,CHECK-ALIASOBJ32 %s
@@ -12,7 +12,7 @@
 # RUN: llvm-mc -triple riscv64 -mattr=+c -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK-ALIAS,CHECK-ALIASASM %s
 # RUN: llvm-mc -triple riscv64 -mattr=+c -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK-INST,CHECK-INSTASM %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK-INST,CHECK-INSTASM %s
 # RUN: llvm-mc -triple riscv64 -mattr=+c -filetype=obj < %s \
 # RUN:   | llvm-objdump  --triple=riscv64 --mattr=+c --no-print-imm-hex -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS,CHECK-ALIASOBJ64 %s
diff --git a/llvm/test/MC/RISCV/compress-rv64i.s b/llvm/test/MC/RISCV/compress-rv64i.s
index ab5b243..31eb4d9 100644
--- a/llvm/test/MC/RISCV/compress-rv64i.s
+++ b/llvm/test/MC/RISCV/compress-rv64i.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple riscv64 -mattr=+c -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv64 -mattr=+c -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s
 # RUN: llvm-mc -triple riscv64 -mattr=+c -filetype=obj < %s \
 # RUN:   | llvm-objdump  --triple=riscv64 --mattr=+c --no-print-imm-hex -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/compressed-relocations.s b/llvm/test/MC/RISCV/compressed-relocations.s
index c7117ab..196d987 100644
--- a/llvm/test/MC/RISCV/compressed-relocations.s
+++ b/llvm/test/MC/RISCV/compressed-relocations.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple riscv32 -mattr=+c -riscv-no-aliases < %s -show-encoding \
+# RUN: llvm-mc -triple riscv32 -mattr=+c -M no-aliases < %s -show-encoding \
 # RUN:     | FileCheck -check-prefix=INSTR -check-prefix=FIXUP %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=RELOC %s
diff --git a/llvm/test/MC/RISCV/compressed-zicfiss.s b/llvm/test/MC/RISCV/compressed-zicfiss.s
index 2ebf9d3..7d387b2 100644
--- a/llvm/test/MC/RISCV/compressed-zicfiss.s
+++ b/llvm/test/MC/RISCV/compressed-zicfiss.s
@@ -1,15 +1,15 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfiss,+zcmop -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfiss,+zcmop -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zicfiss,+zcmop < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-zicfiss,+zcmop -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfiss,+zcmop -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfiss,+zcmop -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zicfiss,+zcmop < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-zicfiss,+zcmop -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
-# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv32 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 
 # CHECK-ASM-AND-OBJ: c.sspopchk t0
diff --git a/llvm/test/MC/RISCV/corev/XCValu-valid.s b/llvm/test/MC/RISCV/corev/XCValu-valid.s
index 1c74e36..2636b34 100644
--- a/llvm/test/MC/RISCV/corev/XCValu-valid.s
+++ b/llvm/test/MC/RISCV/corev/XCValu-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple=riscv32 --mattr=+xcvalu -riscv-no-aliases -show-encoding %s \
+# RUN: llvm-mc -triple=riscv32 --mattr=+xcvalu -M no-aliases -show-encoding %s \
 # RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INSTR
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+xcvalu < %s \
 # RUN:     | llvm-objdump --mattr=+xcvalu --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/corev/XCVmac-valid.s b/llvm/test/MC/RISCV/corev/XCVmac-valid.s
index 93ed9d7..a795bc3 100644
--- a/llvm/test/MC/RISCV/corev/XCVmac-valid.s
+++ b/llvm/test/MC/RISCV/corev/XCVmac-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple=riscv32 --mattr=+xcvmac -riscv-no-aliases -show-encoding %s \
+# RUN: llvm-mc -triple=riscv32 --mattr=+xcvmac -M no-aliases -show-encoding %s \
 # RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INSTR
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+xcvmac < %s \
 # RUN:     | llvm-objdump --mattr=+xcvmac --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/custom_reloc.s b/llvm/test/MC/RISCV/custom_reloc.s
index 4bd4700..cdb8194 100644
--- a/llvm/test/MC/RISCV/custom_reloc.s
+++ b/llvm/test/MC/RISCV/custom_reloc.s
@@ -21,16 +21,33 @@
   .reloc ., R_RISCV_VENDOR,    VENDOR_NAME
   .reloc ., R_RISCV_CUSTOM192, my_foo + 1
   addi a0, a0, 0
-  # CHECK-ASM: [[L1:.L[^:]+]]:
+  # CHECK-ASM:      [[L1:.L[^:]+]]:
   # CHECK-ASM-NEXT: .reloc [[L1]], R_RISCV_VENDOR, VENDOR_NAME
   # CHECK-ASM-NEXT: [[L2:.L[^:]+]]:
   # CHECK-ASM-NEXT: .reloc [[L2]], R_RISCV_CUSTOM192, my_foo+1
   # CHECK-ASM-NEXT: mv a0, a0
 
-  # CHECK-OBJ: addi a0, a0, 0
+  # CHECK-OBJ:      addi a0, a0, 0
   # CHECK-OBJ-NEXT: R_RISCV_VENDOR    VENDOR_NAME
   # CHECK-OBJ-NEXT: R_RISCV_CUSTOM192 my_foo+0x1
 
   nop
   # CHECK-ASM: nop
   # CHECK-OBJ: addi zero, zero, 0x0
+
+  .reloc ., R_RISCV_VENDOR,     QUALCOMM
+  .reloc ., R_RISCV_QC_ABS20_U, my_bar + 2
+  addi a1, a1, 0
+  # CHECK-ASM:      [[L3:.L[^:]+]]:
+  # CHECK-ASM-NEXT: .reloc [[L3]], R_RISCV_VENDOR, QUALCOMM
+  # CHECK-ASM-NEXT: [[L4:.L[^:]+]]:
+  # CHECK-ASM-NEXT: .reloc [[L4]], R_RISCV_QC_ABS20_U, my_bar+2
+  # CHECK-ASM-NEXT: mv a1, a1
+
+  # CHECK-OBJ:      addi a1, a1, 0
+  # CHECK-OBJ-NEXT: R_RISCV_VENDOR    QUALCOMM
+  # CHECK-OBJ-NEXT: R_RISCV_CUSTOM192 my_bar+0x2
+
+  nop
+  # CHECK-ASM: nop
+  # CHECK-OBJ: addi zero, zero, 0x0
diff --git a/llvm/test/MC/RISCV/debug-valid.s b/llvm/test/MC/RISCV/debug-valid.s
index 89b8f00..44dd47d2 100644
--- a/llvm/test/MC/RISCV/debug-valid.s
+++ b/llvm/test/MC/RISCV/debug-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-objdump -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/deprecated-csr-names.s b/llvm/test/MC/RISCV/deprecated-csr-names.s
index e895732..3bb104a 100644
--- a/llvm/test/MC/RISCV/deprecated-csr-names.s
+++ b/llvm/test/MC/RISCV/deprecated-csr-names.s
@@ -1,10 +1,10 @@
-# RUN: llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding %s \
+# RUN: llvm-mc -triple riscv32 -M no-aliases -show-encoding %s \
 # RUN:     | FileCheck -check-prefixes CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype obj -triple riscv32 %s \
 # RUN:     | llvm-objdump -d - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST-ALIAS %s
 
-# RUN: llvm-mc -triple riscv64 -riscv-no-aliases -show-encoding %s \
+# RUN: llvm-mc -triple riscv64 -M no-aliases -show-encoding %s \
 # RUN:     | FileCheck -check-prefixes CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype obj -triple riscv64 %s \
 # RUN:     | llvm-objdump -d - \
diff --git a/llvm/test/MC/RISCV/fixups-binary-expression.s b/llvm/test/MC/RISCV/fixups-binary-expression.s
index dc1de5d..325a54c 100644
--- a/llvm/test/MC/RISCV/fixups-binary-expression.s
+++ b/llvm/test/MC/RISCV/fixups-binary-expression.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple riscv32 -mattr=+c -riscv-no-aliases < %s -show-encoding \
+# RUN: llvm-mc -triple riscv32 -mattr=+c -M no-aliases < %s -show-encoding \
 # RUN:     | FileCheck -check-prefix=CHECK-FIXUP %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c < %s \
 # RUN:     | llvm-objdump -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/fixups.s b/llvm/test/MC/RISCV/fixups.s
index d0682ed..5145dbe 100644
--- a/llvm/test/MC/RISCV/fixups.s
+++ b/llvm/test/MC/RISCV/fixups.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple riscv32 -riscv-no-aliases < %s -show-encoding \
+# RUN: llvm-mc -triple riscv32 -M no-aliases < %s -show-encoding \
 # RUN:     | FileCheck -check-prefix=CHECK-FIXUP %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/fp-default-rounding-mode.s b/llvm/test/MC/RISCV/fp-default-rounding-mode.s
index c918920..88b681a 100644
--- a/llvm/test/MC/RISCV/fp-default-rounding-mode.s
+++ b/llvm/test/MC/RISCV/fp-default-rounding-mode.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+d,+zfh,+zfbfmin -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+d,+zfh,+zfbfmin -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+d,+zfh,+zfbfmin \
 # RUN:     | FileCheck -check-prefixes=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/fp-inx-default-rounding-mode.s b/llvm/test/MC/RISCV/fp-inx-default-rounding-mode.s
index d2764ce..2b628e1 100644
--- a/llvm/test/MC/RISCV/fp-inx-default-rounding-mode.s
+++ b/llvm/test/MC/RISCV/fp-inx-default-rounding-mode.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx,+zhinx -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx,+zhinx -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx,+zhinx \
 # RUN:     | FileCheck -check-prefixes=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/hypervisor-csr-names.s b/llvm/test/MC/RISCV/hypervisor-csr-names.s
index 2f29e5d..a7e3a57 100644
--- a/llvm/test/MC/RISCV/hypervisor-csr-names.s
+++ b/llvm/test/MC/RISCV/hypervisor-csr-names.s
@@ -1,10 +1,10 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-objdump -d - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST-ALIAS %s
 #
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
 # RUN:     | llvm-objdump -d - \
diff --git a/llvm/test/MC/RISCV/insn.s b/llvm/test/MC/RISCV/insn.s
index 829364c6..b1ef251 100644
--- a/llvm/test/MC/RISCV/insn.s
+++ b/llvm/test/MC/RISCV/insn.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+f -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+f -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM %s
-# RUN: llvm-mc %s -triple riscv64 -mattr=+f -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple riscv64 -mattr=+f -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+f < %s \
 # RUN:     | llvm-objdump --mattr=+f -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/insn_c.s b/llvm/test/MC/RISCV/insn_c.s
index c63e8ab..c52f9b6 100644
--- a/llvm/test/MC/RISCV/insn_c.s
+++ b/llvm/test/MC/RISCV/insn_c.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+f,+c -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+f,+c -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefix=CHECK-ASM %s
-# RUN: llvm-mc %s -triple riscv64 -mattr=+f,+c -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple riscv64 -mattr=+f,+c -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefix=CHECK-ASM %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+f,+c < %s \
 # RUN:     | llvm-objdump --mattr=+f,+c -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/machine-csr-names.s b/llvm/test/MC/RISCV/machine-csr-names.s
index 8cfdf7e..ba2a79f 100644
--- a/llvm/test/MC/RISCV/machine-csr-names.s
+++ b/llvm/test/MC/RISCV/machine-csr-names.s
@@ -1,10 +1,10 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-objdump -d - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST-ALIAS %s
 #
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
 # RUN:     | llvm-objdump -d - \
@@ -1419,6 +1419,34 @@ csrrs t1, tdata3, zero
 # uimm12
 csrrs t2, 0x7A3, zero
 
+# tinfo
+# name
+# CHECK-INST: csrrs t1, tinfo, zero
+# CHECK-ENC: encoding: [0x73,0x23,0x40,0x7a]
+# CHECK-INST-ALIAS: csrr t1, tinfo
+# uimm12
+# CHECK-INST: csrrs t2, tinfo, zero
+# CHECK-ENC: encoding: [0xf3,0x23,0x40,0x7a]
+# CHECK-INST-ALIAS: csrr t2, tinfo
+# name
+csrrs t1, tinfo, zero
+# uimm12
+csrrs t2, 0x7A4, zero
+
+# tcontrol
+# name
+# CHECK-INST: csrrs t1, tcontrol, zero
+# CHECK-ENC: encoding: [0x73,0x23,0x50,0x7a]
+# CHECK-INST-ALIAS: csrr t1, tcontrol
+# uimm12
+# CHECK-INST: csrrs t2, tcontrol, zero
+# CHECK-ENC: encoding: [0xf3,0x23,0x50,0x7a]
+# CHECK-INST-ALIAS: csrr t2, tcontrol
+# name
+csrrs t1, tcontrol, zero
+# uimm12
+csrrs t2, 0x7A5, zero
+
 # mcontext
 # name
 # CHECK-INST: csrrs t1, mcontext, zero
@@ -1433,6 +1461,20 @@ csrrs t1, mcontext, zero
 # uimm12
 csrrs t2, 0x7A8, zero
 
+# mscontext
+# name
+# CHECK-INST: csrrs t1, mscontext, zero
+# CHECK-ENC: encoding: [0x73,0x23,0xa0,0x7a]
+# CHECK-INST-ALIAS: csrr t1, mscontext
+# uimm12
+# CHECK-INST: csrrs t2, mscontext, zero
+# CHECK-ENC: encoding: [0xf3,0x23,0xa0,0x7a]
+# CHECK-INST-ALIAS: csrr t2, mscontext
+# name
+csrrs t1, mscontext, zero
+# uimm12
+csrrs t2, 0x7AA, zero
+
 #######################
 # Debug Mode Registers
 ########################
diff --git a/llvm/test/MC/RISCV/option-nopic.s b/llvm/test/MC/RISCV/option-nopic.s
index db0cf1d..20203f6 100644
--- a/llvm/test/MC/RISCV/option-nopic.s
+++ b/llvm/test/MC/RISCV/option-nopic.s
@@ -1,19 +1,19 @@
-# RUN: llvm-mc -triple riscv32 -mattr=-relax -riscv-no-aliases < %s \
+# RUN: llvm-mc -triple riscv32 -mattr=-relax -M no-aliases < %s \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s
 
-# RUN: llvm-mc -triple riscv32 -mattr=-relax -riscv-no-aliases \
+# RUN: llvm-mc -triple riscv32 -mattr=-relax -M no-aliases \
 # RUN:     -position-independent < %s | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -position-independent < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s
 
-# RUN: llvm-mc -triple riscv64 -mattr=-relax -riscv-no-aliases < %s \
+# RUN: llvm-mc -triple riscv64 -mattr=-relax -M no-aliases < %s \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s
 
-# RUN: llvm-mc -triple riscv64 -mattr=-relax -riscv-no-aliases \
+# RUN: llvm-mc -triple riscv64 -mattr=-relax -M no-aliases \
 # RUN:     -position-independent < %s | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 -position-independent < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s
diff --git a/llvm/test/MC/RISCV/option-pic.s b/llvm/test/MC/RISCV/option-pic.s
index ef456e0..9c9381c 100644
--- a/llvm/test/MC/RISCV/option-pic.s
+++ b/llvm/test/MC/RISCV/option-pic.s
@@ -1,19 +1,19 @@
-# RUN: llvm-mc -triple riscv32 -mattr=-relax -riscv-no-aliases < %s \
+# RUN: llvm-mc -triple riscv32 -mattr=-relax -M no-aliases < %s \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s
 
-# RUN: llvm-mc -triple riscv32 -mattr=-relax -riscv-no-aliases \
+# RUN: llvm-mc -triple riscv32 -mattr=-relax -M no-aliases \
 # RUN:     -position-independent < %s | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -position-independent < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s
 
-# RUN: llvm-mc -triple riscv64 -mattr=-relax -riscv-no-aliases < %s \
+# RUN: llvm-mc -triple riscv64 -mattr=-relax -M no-aliases < %s \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s
 
-# RUN: llvm-mc -triple riscv64 -mattr=-relax -riscv-no-aliases \
+# RUN: llvm-mc -triple riscv64 -mattr=-relax -M no-aliases \
 # RUN:     -position-independent < %s | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 -position-independent < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s
diff --git a/llvm/test/MC/RISCV/option-pushpop.s b/llvm/test/MC/RISCV/option-pushpop.s
index 9c61b5d..68d60be 100644
--- a/llvm/test/MC/RISCV/option-pushpop.s
+++ b/llvm/test/MC/RISCV/option-pushpop.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple riscv32 -mattr=-relax -riscv-no-aliases < %s \
+# RUN: llvm-mc -triple riscv32 -mattr=-relax -M no-aliases < %s \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s
@@ -6,7 +6,7 @@
 # RUN:     | llvm-objdump --no-print-imm-hex --triple=riscv32 --mattr=+c -d - \
 # RUN:     | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
 
-# RUN: llvm-mc -triple riscv64 -mattr=-relax -riscv-no-aliases < %s \
+# RUN: llvm-mc -triple riscv64 -mattr=-relax -M no-aliases < %s \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s
diff --git a/llvm/test/MC/RISCV/option-rvc.s b/llvm/test/MC/RISCV/option-rvc.s
index 894fbab..32568b9 100644
--- a/llvm/test/MC/RISCV/option-rvc.s
+++ b/llvm/test/MC/RISCV/option-rvc.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple riscv32 -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv32 -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -triple riscv32 -filetype=obj < %s \
 # RUN:   | llvm-objdump  --triple=riscv32 --mattr=+c --no-print-imm-hex -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
@@ -12,7 +12,7 @@
 # RUN: llvm-mc -triple riscv64 -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv64 -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s
 # RUN: llvm-mc -triple riscv64 -filetype=obj < %s \
 # RUN:   | llvm-objdump  --triple=riscv64 --mattr=+c --no-print-imm-hex -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/print-imm-hex.s b/llvm/test/MC/RISCV/print-imm-hex.s
index 04d405a..10270de 100644
--- a/llvm/test/MC/RISCV/print-imm-hex.s
+++ b/llvm/test/MC/RISCV/print-imm-hex.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding -mattr=+v \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding -mattr=+v \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM %s
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding -mattr=+v --print-imm-hex \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding -mattr=+v --print-imm-hex \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM-HEX %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+v < %s \
 # RUN:     | llvm-objdump -M no-aliases --mattr=+v --no-print-imm-hex -d -r - \
diff --git a/llvm/test/MC/RISCV/priv-valid.s b/llvm/test/MC/RISCV/priv-valid.s
index 561c76b..67dc501 100644
--- a/llvm/test/MC/RISCV/priv-valid.s
+++ b/llvm/test/MC/RISCV/priv-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+svinval -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+svinval -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+svinval -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+svinval -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+svinval < %s \
 # RUN:     | llvm-objdump --mattr=+svinval -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/relocations.s b/llvm/test/MC/RISCV/relocations.s
index f5f6417..85a25fe 100644
--- a/llvm/test/MC/RISCV/relocations.s
+++ b/llvm/test/MC/RISCV/relocations.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple riscv32 -riscv-no-aliases < %s -show-encoding \
+# RUN: llvm-mc -triple riscv32 -M no-aliases < %s -show-encoding \
 # RUN:     | FileCheck -check-prefix=INSTR -check-prefix=FIXUP %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=RELOC %s
diff --git a/llvm/test/MC/RISCV/rv32-hypervisor-csr-names.s b/llvm/test/MC/RISCV/rv32-hypervisor-csr-names.s
index 83e6d21..aadee4f 100644
--- a/llvm/test/MC/RISCV/rv32-hypervisor-csr-names.s
+++ b/llvm/test/MC/RISCV/rv32-hypervisor-csr-names.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-objdump -d - \
diff --git a/llvm/test/MC/RISCV/rv32-machine-csr-names.s b/llvm/test/MC/RISCV/rv32-machine-csr-names.s
index e7a6d9c..3d527e3 100644
--- a/llvm/test/MC/RISCV/rv32-machine-csr-names.s
+++ b/llvm/test/MC/RISCV/rv32-machine-csr-names.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-objdump -d - \
diff --git a/llvm/test/MC/RISCV/rv32-supervisor-csr-names.s b/llvm/test/MC/RISCV/rv32-supervisor-csr-names.s
index 4c1fef4..ca7887a 100644
--- a/llvm/test/MC/RISCV/rv32-supervisor-csr-names.s
+++ b/llvm/test/MC/RISCV/rv32-supervisor-csr-names.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-objdump -d - \
diff --git a/llvm/test/MC/RISCV/rv32-user-csr-names.s b/llvm/test/MC/RISCV/rv32-user-csr-names.s
index acd6646..6fb9861 100644
--- a/llvm/test/MC/RISCV/rv32-user-csr-names.s
+++ b/llvm/test/MC/RISCV/rv32-user-csr-names.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-objdump -d - \
diff --git a/llvm/test/MC/RISCV/rv32c-aliases-valid.s b/llvm/test/MC/RISCV/rv32c-aliases-valid.s
index f159adb..de9d0c6 100644
--- a/llvm/test/MC/RISCV/rv32c-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv32c-aliases-valid.s
@@ -1,9 +1,9 @@
-# RUN: llvm-mc -triple=riscv32 -mattr=+c -riscv-no-aliases < %s \
+# RUN: llvm-mc -triple=riscv32 -mattr=+c -M no-aliases < %s \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex -d -M no-aliases - \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST %s
-# RUN: llvm-mc -triple=riscv32 -mattr=+zca -riscv-no-aliases < %s \
+# RUN: llvm-mc -triple=riscv32 -mattr=+zca -M no-aliases < %s \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+zca < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex --mattr=+zca -d -M no-aliases - \
diff --git a/llvm/test/MC/RISCV/rv32c-only-valid.s b/llvm/test/MC/RISCV/rv32c-only-valid.s
index 3321aff..c4fec69 100644
--- a/llvm/test/MC/RISCV/rv32c-only-valid.s
+++ b/llvm/test/MC/RISCV/rv32c-only-valid.s
@@ -1,17 +1,17 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -M no-aliases -show-encoding \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+c < %s \
 # RUN:     | llvm-objdump --mattr=+c --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv32 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck --check-prefix=CHECK-NO-EXT %s
 # RUN: not llvm-mc -triple riscv64 -mattr=+c \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck --check-prefix=CHECK-NO-RV32 %s
 # RUN: not llvm-mc -triple riscv64 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck --check-prefix=CHECK-NO-RV32-AND-EXT %s
 
 # CHECK-OBJ: c.jal 0x7fe
diff --git a/llvm/test/MC/RISCV/rv32dc-valid.s b/llvm/test/MC/RISCV/rv32dc-valid.s
index 201aee5..495c884 100644
--- a/llvm/test/MC/RISCV/rv32dc-valid.s
+++ b/llvm/test/MC/RISCV/rv32dc-valid.s
@@ -1,18 +1,18 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+d -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+d -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+c,+d < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex --mattr=+c,+d -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zcd,+d -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zcd,+d -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zcd,+d < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex --mattr=+zcd,+d -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv32 -mattr=+c \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT-D %s
-# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv32 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT-DC %s
 
 # CHECK-ASM-AND-OBJ: c.fldsp  fs0, 504(sp)
diff --git a/llvm/test/MC/RISCV/rv32fc-aliases-valid.s b/llvm/test/MC/RISCV/rv32fc-aliases-valid.s
index d992d07..f1a1c73 100644
--- a/llvm/test/MC/RISCV/rv32fc-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv32fc-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+f -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+f -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c,+f < %s \
 # RUN:     | llvm-objdump --mattr=+c,+f --no-print-imm-hex -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/rv32fc-valid.s b/llvm/test/MC/RISCV/rv32fc-valid.s
index 9360325..af38a63 100644
--- a/llvm/test/MC/RISCV/rv32fc-valid.s
+++ b/llvm/test/MC/RISCV/rv32fc-valid.s
@@ -1,22 +1,22 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+f -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+f -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+c,+f < %s \
 # RUN:     | llvm-objdump --mattr=+c,+f --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zcf,+f -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zcf,+f -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zcf,+f < %s \
 # RUN:     | llvm-objdump --mattr=+zcf,+f --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv32 -mattr=+c \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT-F %s
 # RUN: not llvm-mc -triple riscv32 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT-FC %s
 # RUN: not llvm-mc -triple riscv64 -mattr=+c,+f \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-RV32 %s
 
 # FIXME: error messages for rv64fc are misleading
diff --git a/llvm/test/MC/RISCV/rv32i-aliases-invalid.s b/llvm/test/MC/RISCV/rv32i-aliases-invalid.s
index 9254452..7f54fe7 100644
--- a/llvm/test/MC/RISCV/rv32i-aliases-invalid.s
+++ b/llvm/test/MC/RISCV/rv32i-aliases-invalid.s
@@ -1,5 +1,5 @@
 # UNSUPPORTED: target={{.*-windows.*}}
-# RUN: not llvm-mc -triple=riscv32 -riscv-no-aliases < %s -o /dev/null 2>&1 | FileCheck %s
+# RUN: not llvm-mc -triple=riscv32 -M no-aliases < %s -o /dev/null 2>&1 | FileCheck %s
 # RUN: not llvm-mc -triple=riscv32 < %s -o /dev/null 2>&1 | FileCheck %s
 
 # TODO ld
diff --git a/llvm/test/MC/RISCV/rv32i-aliases-valid.s b/llvm/test/MC/RISCV/rv32i-aliases-valid.s
index 93d8cb2..20deda4 100644
--- a/llvm/test/MC/RISCV/rv32i-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv32i-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST,CHECK-ASM-NOALIAS %s
 # RUN: llvm-mc %s -triple=riscv32 \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-ALIAS,CHECK-ASM %s
diff --git a/llvm/test/MC/RISCV/rv32i-only-valid.s b/llvm/test/MC/RISCV/rv32i-only-valid.s
index 74232e3..afe62ce 100644
--- a/llvm/test/MC/RISCV/rv32i-only-valid.s
+++ b/llvm/test/MC/RISCV/rv32i-only-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 < %s \
 # RUN:     | llvm-objdump -M no-aliases --no-print-imm-hex -d -r - \
diff --git a/llvm/test/MC/RISCV/rv32zbb-aliases-valid.s b/llvm/test/MC/RISCV/rv32zbb-aliases-valid.s
index 26a725a..994b46e 100644
--- a/llvm/test/MC/RISCV/rv32zbb-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv32zbb-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-OBJ-NOALIAS %s
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-OBJ %s
diff --git a/llvm/test/MC/RISCV/rv32zbb-only-valid.s b/llvm/test/MC/RISCV/rv32zbb-only-valid.s
index 8cee959..0e18cf2 100644
--- a/llvm/test/MC/RISCV/rv32zbb-only-valid.s
+++ b/llvm/test/MC/RISCV/rv32zbb-only-valid.s
@@ -1,5 +1,5 @@
 # With Bitmanip base extension:
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zbb < %s \
 # RUN:     | llvm-objdump --mattr=+zbb --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv32zbs-aliases-valid.s b/llvm/test/MC/RISCV/rv32zbs-aliases-valid.s
index 3ebb297..60599c5 100644
--- a/llvm/test/MC/RISCV/rv32zbs-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv32zbs-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbs -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbs -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-OBJ-NOALIAS %s
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+zbs \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-OBJ %s
diff --git a/llvm/test/MC/RISCV/rv32zcmp-invalid.s b/llvm/test/MC/RISCV/rv32zcmp-invalid.s
index 2ed82bc..0720a74 100644
--- a/llvm/test/MC/RISCV/rv32zcmp-invalid.s
+++ b/llvm/test/MC/RISCV/rv32zcmp-invalid.s
@@ -1,4 +1,4 @@
-# RUN: not llvm-mc -triple=riscv32 -mattr=zcmp -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple=riscv32 -mattr=zcmp -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-ERROR %s
 
 # CHECK-ERROR: error: invalid operand for instruction
diff --git a/llvm/test/MC/RISCV/rv32zcmp-valid.s b/llvm/test/MC/RISCV/rv32zcmp-valid.s
index 31e287b..d144c6f 100644
--- a/llvm/test/MC/RISCV/rv32zcmp-valid.s
+++ b/llvm/test/MC/RISCV/rv32zcmp-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=zcmp -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=zcmp -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=zcmp < %s \
 # RUN:     | llvm-objdump --mattr=-c,zcmp -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv32zfa-only-valid.s b/llvm/test/MC/RISCV/rv32zfa-only-valid.s
index d212659..a780a9f 100644
--- a/llvm/test/MC/RISCV/rv32zfa-only-valid.s
+++ b/llvm/test/MC/RISCV/rv32zfa-only-valid.s
@@ -1,11 +1,11 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+d,+zfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+d,+zfh -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfa,+d,+zfh < %s \
 # RUN:     | llvm-objdump --mattr=+zfa,+d,+zfh -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv32 -mattr=+d,+zfh \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 
 # CHECK-ASM-AND-OBJ: fmvh.x.d a1, fs1
diff --git a/llvm/test/MC/RISCV/rv32zicfiss-invalid.s b/llvm/test/MC/RISCV/rv32zicfiss-invalid.s
index 1cedcb9..048df67 100644
--- a/llvm/test/MC/RISCV/rv32zicfiss-invalid.s
+++ b/llvm/test/MC/RISCV/rv32zicfiss-invalid.s
@@ -1,4 +1,4 @@
-# RUN: not llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfiss,+c -riscv-no-aliases -show-encoding \
+# RUN: not llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfiss,+c -M no-aliases -show-encoding \
 # RUN:     2>&1 | FileCheck -check-prefixes=CHECK-ERR %s
 
 # CHECK-ERR: error: invalid operand for instruction
diff --git a/llvm/test/MC/RISCV/rv32zmmul-invaild.s b/llvm/test/MC/RISCV/rv32zmmul-invaild.s
index cf4ced8..b5f1ee6 100644
--- a/llvm/test/MC/RISCV/rv32zmmul-invaild.s
+++ b/llvm/test/MC/RISCV/rv32zmmul-invaild.s
@@ -1,4 +1,4 @@
-# RUN: not llvm-mc %s -triple=riscv32 -mattr=+zmmul -riscv-no-aliases 2>&1 \
+# RUN: not llvm-mc %s -triple=riscv32 -mattr=+zmmul -M no-aliases 2>&1 \
 # RUN:  | FileCheck -check-prefixes=CHECK-ERROR %s
 
 # CHECK-ERROR: 5:1: error: instruction requires the following: 'M' (Integer Multiplication and Division){{$}}
diff --git a/llvm/test/MC/RISCV/rv32zmmul-valid.s b/llvm/test/MC/RISCV/rv32zmmul-valid.s
index 929dc52..b226685 100644
--- a/llvm/test/MC/RISCV/rv32zmmul-valid.s
+++ b/llvm/test/MC/RISCV/rv32zmmul-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zmmul -riscv-no-aliases 2>&1 \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zmmul -M no-aliases 2>&1 \
 # RUN:  | FileCheck -check-prefixes=CHECK-INST %s
 
 # CHECK-INST: mul a4, ra, s0
diff --git a/llvm/test/MC/RISCV/rv64-machine-csr-names.s b/llvm/test/MC/RISCV/rv64-machine-csr-names.s
index 3efebf9..b49eb17 100644
--- a/llvm/test/MC/RISCV/rv64-machine-csr-names.s
+++ b/llvm/test/MC/RISCV/rv64-machine-csr-names.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex -d - \
diff --git a/llvm/test/MC/RISCV/rv64-user-csr-names.s b/llvm/test/MC/RISCV/rv64-user-csr-names.s
index fc23b69..afb7235 100644
--- a/llvm/test/MC/RISCV/rv64-user-csr-names.s
+++ b/llvm/test/MC/RISCV/rv64-user-csr-names.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
 # RUN:     | llvm-objdump -d - \
diff --git a/llvm/test/MC/RISCV/rv64a-aliases-valid.s b/llvm/test/MC/RISCV/rv64a-aliases-valid.s
index 09999536..577010f 100644
--- a/llvm/test/MC/RISCV/rv64a-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv64a-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+a \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/rv64c-aliases-valid.s b/llvm/test/MC/RISCV/rv64c-aliases-valid.s
index ccf9e6a..6856697 100644
--- a/llvm/test/MC/RISCV/rv64c-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv64c-aliases-valid.s
@@ -1,9 +1,9 @@
-# RUN: llvm-mc -triple=riscv64 -mattr=+c -riscv-no-aliases < %s \
+# RUN: llvm-mc -triple=riscv64 -mattr=+c -M no-aliases < %s \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+c < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex -d -M no-aliases - \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST %s
-# RUN: llvm-mc -triple=riscv64 -mattr=+zca -riscv-no-aliases < %s \
+# RUN: llvm-mc -triple=riscv64 -mattr=+zca -M no-aliases < %s \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+zca < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex --mattr=+zca -d -M no-aliases - \
diff --git a/llvm/test/MC/RISCV/rv64c-hints-valid.s b/llvm/test/MC/RISCV/rv64c-hints-valid.s
index 92cbe542..95d0932 100644
--- a/llvm/test/MC/RISCV/rv64c-hints-valid.s
+++ b/llvm/test/MC/RISCV/rv64c-hints-valid.s
@@ -1,9 +1,9 @@
-# RUN: llvm-mc %s -triple riscv64 -mattr=+c -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple riscv64 -mattr=+c -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+c < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple riscv64 -mattr=+zca -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple riscv64 -mattr=+zca -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zca < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex --mattr=+zca -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64c-valid.s b/llvm/test/MC/RISCV/rv64c-valid.s
index a9f0cf5..f8736e5 100644
--- a/llvm/test/MC/RISCV/rv64c-valid.s
+++ b/llvm/test/MC/RISCV/rv64c-valid.s
@@ -1,9 +1,9 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+c -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+c -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+c < %s \
 # RUN:     | llvm-objdump --mattr=+c --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zca -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zca -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+c < %s \
 # RUN:     | llvm-objdump --mattr=+c --no-print-imm-hex -M no-aliases -d -r - \
@@ -11,10 +11,10 @@
 #
 #
 # RUN: not llvm-mc -triple riscv64 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 # RUN: not llvm-mc -triple riscv32 -mattr=+c \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-RV64 %s
 
 # TODO: more exhaustive testing of immediate encoding.
diff --git a/llvm/test/MC/RISCV/rv64d-aliases-valid.s b/llvm/test/MC/RISCV/rv64d-aliases-valid.s
index 17a44b4..c23aa84 100644
--- a/llvm/test/MC/RISCV/rv64d-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv64d-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+d \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/rv64d-valid.s b/llvm/test/MC/RISCV/rv64d-valid.s
index e6cc8ec..8a9f608 100644
--- a/llvm/test/MC/RISCV/rv64d-valid.s
+++ b/llvm/test/MC/RISCV/rv64d-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+d < %s \
 # RUN:     | llvm-objdump --mattr=+d -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64dc-valid.s b/llvm/test/MC/RISCV/rv64dc-valid.s
index 83225b2..7f2b8c4 100644
--- a/llvm/test/MC/RISCV/rv64dc-valid.s
+++ b/llvm/test/MC/RISCV/rv64dc-valid.s
@@ -1,18 +1,18 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+c,+d -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+c,+d -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+c,+d < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex --mattr=+c,+d -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zcd,+d -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zcd,+d -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zcd,+d < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex --mattr=+zcd,+d -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv64 -mattr=+c \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT-D %s
-# RUN: not llvm-mc -triple riscv64 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv64 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT-DC %s
 
 # CHECK-ASM-AND-OBJ: c.fldsp  fs0, 504(sp)
diff --git a/llvm/test/MC/RISCV/rv64e-valid.s b/llvm/test/MC/RISCV/rv64e-valid.s
index f7b66fd..42f9b2c 100644
--- a/llvm/test/MC/RISCV/rv64e-valid.s
+++ b/llvm/test/MC/RISCV/rv64e-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -mattr=+e -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -mattr=+e -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+e < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64e-zcmp-valid.s b/llvm/test/MC/RISCV/rv64e-zcmp-valid.s
index 376edf0..607a023 100644
--- a/llvm/test/MC/RISCV/rv64e-zcmp-valid.s
+++ b/llvm/test/MC/RISCV/rv64e-zcmp-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=zcmp,+e -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=zcmp,+e -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=zcmp < %s \
 # RUN:     | llvm-objdump --mattr=-c,zcmp -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64f-aliases-valid.s b/llvm/test/MC/RISCV/rv64f-aliases-valid.s
index 488d52f..d04b0b5 100644
--- a/llvm/test/MC/RISCV/rv64f-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv64f-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+f \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/rv64f-valid.s b/llvm/test/MC/RISCV/rv64f-valid.s
index 108e1eb..1fea787 100644
--- a/llvm/test/MC/RISCV/rv64f-valid.s
+++ b/llvm/test/MC/RISCV/rv64f-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+f < %s \
 # RUN:     | llvm-objdump --mattr=+f -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64i-aliases-invalid.s b/llvm/test/MC/RISCV/rv64i-aliases-invalid.s
index 34552b5..1bd4e78 100644
--- a/llvm/test/MC/RISCV/rv64i-aliases-invalid.s
+++ b/llvm/test/MC/RISCV/rv64i-aliases-invalid.s
@@ -1,5 +1,5 @@
 # UNSUPPORTED: target={{.*-windows.*}}
-# RUN: not llvm-mc -triple=riscv64 -riscv-no-aliases < %s -o /dev/null 2>&1 | FileCheck %s
+# RUN: not llvm-mc -triple=riscv64 -M no-aliases < %s -o /dev/null 2>&1 | FileCheck %s
 # RUN: not llvm-mc -triple=riscv64 < %s 2>&1 -o /dev/null | FileCheck %s
 
 li t5, 0x10000000000000000 # CHECK: :[[@LINE]]:8: error: unknown operand
diff --git a/llvm/test/MC/RISCV/rv64i-aliases-valid.s b/llvm/test/MC/RISCV/rv64i-aliases-valid.s
index f36446d..dde8dbe 100644
--- a/llvm/test/MC/RISCV/rv64i-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv64i-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST,CHECK-ASM-NOALIAS %s
 # RUN: llvm-mc %s -triple=riscv64 \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-ALIAS,CHECK-ASM %s
diff --git a/llvm/test/MC/RISCV/rv64i-valid.s b/llvm/test/MC/RISCV/rv64i-valid.s
index ec101f9..7f94fbf 100644
--- a/llvm/test/MC/RISCV/rv64i-valid.s
+++ b/llvm/test/MC/RISCV/rv64i-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64ih-valid.s b/llvm/test/MC/RISCV/rv64ih-valid.s
index be8ccf9..13ca4f0 100644
--- a/llvm/test/MC/RISCV/rv64ih-valid.s
+++ b/llvm/test/MC/RISCV/rv64ih-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+h -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+h -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -mattr=+h -triple riscv64 < %s \
 # RUN:     | llvm-objdump --mattr=+h -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/rv64m-valid.s b/llvm/test/MC/RISCV/rv64m-valid.s
index 246f74f..21db064 100644
--- a/llvm/test/MC/RISCV/rv64m-valid.s
+++ b/llvm/test/MC/RISCV/rv64m-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+m -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+m -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+m < %s \
 # RUN:     | llvm-objdump --mattr=+m -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zaamo-valid.s b/llvm/test/MC/RISCV/rv64zaamo-valid.s
index 96d3e61..c401ce1 100644
--- a/llvm/test/MC/RISCV/rv64zaamo-valid.s
+++ b/llvm/test/MC/RISCV/rv64zaamo-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a < %s \
 # RUN:     | llvm-objdump --mattr=+a -M no-aliases -d -r - \
@@ -7,7 +7,7 @@
 # RUN: not llvm-mc -triple riscv32 -mattr=+a < %s 2>&1 \
 # RUN:     | FileCheck -check-prefix=CHECK-RV32 %s
 #
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zaamo -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zaamo -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zaamo < %s \
 # RUN:     | llvm-objdump --mattr=+zaamo -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zacas-valid.s b/llvm/test/MC/RISCV/rv64zacas-valid.s
index 595c70b..d90e778 100644
--- a/llvm/test/MC/RISCV/rv64zacas-valid.s
+++ b/llvm/test/MC/RISCV/rv64zacas-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zacas -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zacas -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a,+zacas < %s \
 # RUN:     | llvm-objdump --mattr=+a,+zacas -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zalasr-valid.s b/llvm/test/MC/RISCV/rv64zalasr-valid.s
index 2f1e381..13d2b21 100644
--- a/llvm/test/MC/RISCV/rv64zalasr-valid.s
+++ b/llvm/test/MC/RISCV/rv64zalasr-valid.s
@@ -1,11 +1,11 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zalasr -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zalasr -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zalasr < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-zalasr -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv64 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck --check-prefixes=CHECK-NO-EXT %s
 
 
diff --git a/llvm/test/MC/RISCV/rv64zalrsc-valid.s b/llvm/test/MC/RISCV/rv64zalrsc-valid.s
index 2bbde96..98ac38d 100644
--- a/llvm/test/MC/RISCV/rv64zalrsc-valid.s
+++ b/llvm/test/MC/RISCV/rv64zalrsc-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a < %s \
 # RUN:     | llvm-objdump --mattr=+a -M no-aliases -d -r - \
@@ -7,7 +7,7 @@
 # RUN: not llvm-mc -triple riscv32 -mattr=+a < %s 2>&1 \
 # RUN:     | FileCheck -check-prefix=CHECK-RV32 %s
 #
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zalrsc -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zalrsc -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zalrsc < %s \
 # RUN:     | llvm-objdump --mattr=+zalrsc -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zba-aliases-valid.s b/llvm/test/MC/RISCV/rv64zba-aliases-valid.s
index bb8eeb4..78ae18b 100644
--- a/llvm/test/MC/RISCV/rv64zba-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv64zba-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zba -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zba -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-OBJ-NOALIAS %s
 # RUN: llvm-mc %s  -triple=riscv64 -mattr=+zba \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-OBJ %s
diff --git a/llvm/test/MC/RISCV/rv64zbb-aliases-valid.s b/llvm/test/MC/RISCV/rv64zbb-aliases-valid.s
index 662f360..50d6716 100644
--- a/llvm/test/MC/RISCV/rv64zbb-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv64zbb-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbb -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbb -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-OBJ-NOALIAS %s
 # RUN: llvm-mc %s  -triple=riscv64 -mattr=+zbb \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-OBJ %s
diff --git a/llvm/test/MC/RISCV/rv64zbb-valid.s b/llvm/test/MC/RISCV/rv64zbb-valid.s
index 6c7327f..5617e11 100644
--- a/llvm/test/MC/RISCV/rv64zbb-valid.s
+++ b/llvm/test/MC/RISCV/rv64zbb-valid.s
@@ -1,5 +1,5 @@
 # With Bitmanip base extension:
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbb -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbb -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zbb < %s \
 # RUN:     | llvm-objdump --mattr=+zbb -M no-aliases --no-print-imm-hex -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zbs-aliases-valid.s b/llvm/test/MC/RISCV/rv64zbs-aliases-valid.s
index 0bfd3ac..0379a06 100644
--- a/llvm/test/MC/RISCV/rv64zbs-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv64zbs-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbs -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbs -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-OBJ-NOALIAS %s
 # RUN: llvm-mc %s  -triple=riscv64 -mattr=+zbs \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-OBJ %s
diff --git a/llvm/test/MC/RISCV/rv64zcb-valid.s b/llvm/test/MC/RISCV/rv64zcb-valid.s
index ab0550e..83e7fd6 100644
--- a/llvm/test/MC/RISCV/rv64zcb-valid.s
+++ b/llvm/test/MC/RISCV/rv64zcb-valid.s
@@ -1,14 +1,14 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+m,+zbb,+zba,+zcb -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+m,+zbb,+zba,+zcb -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+m,+zbb,+zba,+zcb < %s \
 # RUN:     | llvm-objdump --mattr=+m,+zbb,+zba,zcb -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv64 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 # RUN: not llvm-mc -triple riscv32 -mattr=+m,+zbb,+zba,+zcb \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-RV64 %s
 
 # CHECK-ASM-AND-OBJ: c.zext.w s0
diff --git a/llvm/test/MC/RISCV/rv64zcmp-invalid.s b/llvm/test/MC/RISCV/rv64zcmp-invalid.s
index 8f353e8..7e10ab5c 100644
--- a/llvm/test/MC/RISCV/rv64zcmp-invalid.s
+++ b/llvm/test/MC/RISCV/rv64zcmp-invalid.s
@@ -1,4 +1,4 @@
-# RUN: not llvm-mc -triple=riscv64 -mattr=zcmp -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple=riscv64 -mattr=zcmp -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-ERROR %s
 
 # CHECK-ERROR: error: invalid operand for instruction
diff --git a/llvm/test/MC/RISCV/rv64zcmp-valid.s b/llvm/test/MC/RISCV/rv64zcmp-valid.s
index 5973f6d..c70a904 100644
--- a/llvm/test/MC/RISCV/rv64zcmp-valid.s
+++ b/llvm/test/MC/RISCV/rv64zcmp-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=zcmp -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=zcmp -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=zcmp < %s \
 # RUN:     | llvm-objdump --mattr=-c,zcmp -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zdinx-valid.s b/llvm/test/MC/RISCV/rv64zdinx-valid.s
index fa603f3..411f424 100644
--- a/llvm/test/MC/RISCV/rv64zdinx-valid.s
+++ b/llvm/test/MC/RISCV/rv64zdinx-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zdinx %s \
 # RUN:     | llvm-objdump --mattr=+zdinx -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zfh-valid.s b/llvm/test/MC/RISCV/rv64zfh-valid.s
index 5a15040..5fb8ba5 100644
--- a/llvm/test/MC/RISCV/rv64zfh-valid.s
+++ b/llvm/test/MC/RISCV/rv64zfh-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zfh < %s \
 # RUN:     | llvm-objdump --mattr=+zfh -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zfinx-valid.s b/llvm/test/MC/RISCV/rv64zfinx-valid.s
index d2de9a3..63006e9 100644
--- a/llvm/test/MC/RISCV/rv64zfinx-valid.s
+++ b/llvm/test/MC/RISCV/rv64zfinx-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zfinx %s \
 # RUN:     | llvm-objdump --mattr=+zfinx -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zhinx-valid.s b/llvm/test/MC/RISCV/rv64zhinx-valid.s
index cba252f..c6aa559 100644
--- a/llvm/test/MC/RISCV/rv64zhinx-valid.s
+++ b/llvm/test/MC/RISCV/rv64zhinx-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zhinx %s \
 # RUN:     | llvm-objdump --mattr=+zhinx -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zhinxmin-valid.s b/llvm/test/MC/RISCV/rv64zhinxmin-valid.s
index 062844f..3489549 100644
--- a/llvm/test/MC/RISCV/rv64zhinxmin-valid.s
+++ b/llvm/test/MC/RISCV/rv64zhinxmin-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx,+zdinx -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx,+zdinx -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zhinx,+zdinx %s \
 # RUN:     | llvm-objdump --mattr=+zhinx,+zdinx -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zicfiss-invalid.s b/llvm/test/MC/RISCV/rv64zicfiss-invalid.s
index 1296940..fc69c68 100644
--- a/llvm/test/MC/RISCV/rv64zicfiss-invalid.s
+++ b/llvm/test/MC/RISCV/rv64zicfiss-invalid.s
@@ -1,4 +1,4 @@
-# RUN: not llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfiss,+c -riscv-no-aliases -show-encoding \
+# RUN: not llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfiss,+c -M no-aliases -show-encoding \
 # RUN:     2>&1 | FileCheck -check-prefixes=CHECK-ERR %s
 
 # CHECK-ERR: error: invalid operand for instruction
diff --git a/llvm/test/MC/RISCV/rv64zmmul-invalid.s b/llvm/test/MC/RISCV/rv64zmmul-invalid.s
index 026b0a4..3d27ed9 100644
--- a/llvm/test/MC/RISCV/rv64zmmul-invalid.s
+++ b/llvm/test/MC/RISCV/rv64zmmul-invalid.s
@@ -1,4 +1,4 @@
-# RUN: not llvm-mc %s -triple=riscv64 -mattr=+zmmul -riscv-no-aliases 2>&1 \
+# RUN: not llvm-mc %s -triple=riscv64 -mattr=+zmmul -M no-aliases 2>&1 \
 # RUN:  | FileCheck -check-prefixes=CHECK-ERROR %s
 
 # CHECK-ERROR: 5:1: error: instruction requires the following: 'M' (Integer Multiplication and Division){{$}}
diff --git a/llvm/test/MC/RISCV/rv64zmmul-valid.s b/llvm/test/MC/RISCV/rv64zmmul-valid.s
index 80d05ac..b287d89 100644
--- a/llvm/test/MC/RISCV/rv64zmmul-valid.s
+++ b/llvm/test/MC/RISCV/rv64zmmul-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zmmul -riscv-no-aliases 2>&1 \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zmmul -M no-aliases 2>&1 \
 # RUN:  | FileCheck -check-prefixes=CHECK-INST %s
 
 # CHECK-INST: mulw ra, sp, gp
diff --git a/llvm/test/MC/RISCV/rva-aliases-valid.s b/llvm/test/MC/RISCV/rva-aliases-valid.s
index 57d96b5..7b33407 100644
--- a/llvm/test/MC/RISCV/rva-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rva-aliases-valid.s
@@ -1,8 +1,8 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+a -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+a -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-NOALIAS,CHECK-S-OBJ-NOALIAS %s
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+a \
 # RUN:     | FileCheck -check-prefixes=CHECK-S,CHECK-S-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -riscv-no-aliases\
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -M no-aliases\
 # RUN:     | FileCheck -check-prefixes=CHECK-S-NOALIAS,CHECK-S-OBJ-NOALIAS %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+a \
 # RUN:     | FileCheck -check-prefixes=CHECK-S,CHECK-S-OBJ %s
diff --git a/llvm/test/MC/RISCV/rvc-aliases-valid.s b/llvm/test/MC/RISCV/rvc-aliases-valid.s
index fa73922..23398f9 100644
--- a/llvm/test/MC/RISCV/rvc-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rvc-aliases-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+c -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+c -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/rvc-hints-valid.s b/llvm/test/MC/RISCV/rvc-hints-valid.s
index 562fe65..5dc86d1 100644
--- a/llvm/test/MC/RISCV/rvc-hints-valid.s
+++ b/llvm/test/MC/RISCV/rvc-hints-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple riscv64 -mattr=+c -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple riscv64 -mattr=+c -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+c < %s \
 # RUN:     | llvm-objdump -M no-aliases --no-print-imm-hex -d -r - \
diff --git a/llvm/test/MC/RISCV/rvc-valid.s b/llvm/test/MC/RISCV/rvc-valid.s
index 9b0ca80..798bff8 100644
--- a/llvm/test/MC/RISCV/rvc-valid.s
+++ b/llvm/test/MC/RISCV/rvc-valid.s
@@ -1,26 +1,26 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+c < %s \
 # RUN:     | llvm-objdump --mattr=+c --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefixes=CHECK-OBJ,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zca -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zca -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zca < %s \
 # RUN:     | llvm-objdump --mattr=+zca --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefixes=CHECK-OBJ,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+c -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+c -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+c < %s \
 # RUN:     | llvm-objdump --mattr=+c --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zca -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zca -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zca < %s \
 # RUN:     | llvm-objdump --mattr=+zca --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 
 # RUN: not llvm-mc -triple riscv32 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 
 # TODO: more exhaustive testing of immediate encoding.
diff --git a/llvm/test/MC/RISCV/rvd-aliases-valid.s b/llvm/test/MC/RISCV/rvd-aliases-valid.s
index 58478195..9832b73 100644
--- a/llvm/test/MC/RISCV/rvd-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rvd-aliases-valid.s
@@ -1,8 +1,8 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+d -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+d -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+d \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+d \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/rvd-valid.s b/llvm/test/MC/RISCV/rvd-valid.s
index f782900..5135562 100644
--- a/llvm/test/MC/RISCV/rvd-valid.s
+++ b/llvm/test/MC/RISCV/rvd-valid.s
@@ -1,9 +1,9 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+d -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+d -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+d < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex --mattr=+d -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+d < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex --mattr=+d -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvdc-aliases-valid.s b/llvm/test/MC/RISCV/rvdc-aliases-valid.s
index f74ee05..083c4d2 100644
--- a/llvm/test/MC/RISCV/rvdc-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rvdc-aliases-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+d -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+d -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+c,+d -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+c,+d -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c,+d < %s \
 # RUN:     | llvm-objdump --mattr=+c,+d --no-print-imm-hex -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/rve-valid.s b/llvm/test/MC/RISCV/rve-valid.s
index ccb47f1..d151c07 100644
--- a/llvm/test/MC/RISCV/rve-valid.s
+++ b/llvm/test/MC/RISCV/rve-valid.s
@@ -1,9 +1,9 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -mattr=+e -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -mattr=+e -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+e < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck -check-prefixes=CHECK-OBJ,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -mattr=+e -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -mattr=+e -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+e < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvf-aliases-valid.s b/llvm/test/MC/RISCV/rvf-aliases-valid.s
index 0430e2a..e0b63ec 100644
--- a/llvm/test/MC/RISCV/rvf-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rvf-aliases-valid.s
@@ -1,8 +1,8 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+f -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+f -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+f \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+f \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/rvf-user-csr-names.s b/llvm/test/MC/RISCV/rvf-user-csr-names.s
index 7b7569d..697c333 100644
--- a/llvm/test/MC/RISCV/rvf-user-csr-names.s
+++ b/llvm/test/MC/RISCV/rvf-user-csr-names.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -mattr=+f -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -mattr=+f -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+f < %s \
 # RUN:     | llvm-objdump -d --mattr=+f - \
@@ -7,7 +7,7 @@
 # RUN:     | llvm-objdump -d - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST-ALIAS-NO-F %s
 #
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -mattr=+f -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -mattr=+f -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+f < %s \
 # RUN:     | llvm-objdump -d --mattr=+f - \
diff --git a/llvm/test/MC/RISCV/rvf-valid.s b/llvm/test/MC/RISCV/rvf-valid.s
index 77b5df0..49b67a0 100644
--- a/llvm/test/MC/RISCV/rvf-valid.s
+++ b/llvm/test/MC/RISCV/rvf-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+f -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+f -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+f < %s \
 # RUN:     | llvm-objdump --mattr=+f --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvi-valid.s b/llvm/test/MC/RISCV/rvi-valid.s
index 86b508a..25b72d4 100644
--- a/llvm/test/MC/RISCV/rvi-valid.s
+++ b/llvm/test/MC/RISCV/rvi-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple riscv64 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple riscv64 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 < %s \
 # RUN:     | llvm-objdump -M no-aliases --no-print-imm-hex -d -r - \
diff --git a/llvm/test/MC/RISCV/rvih-valid.s b/llvm/test/MC/RISCV/rvih-valid.s
index 6f80a24..a3a9d29 100644
--- a/llvm/test/MC/RISCV/rvih-valid.s
+++ b/llvm/test/MC/RISCV/rvih-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+h -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+h -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+h -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+h -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+h < %s \
 # RUN:     | llvm-objdump --mattr=+h -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/rvk-user-csr-name.s b/llvm/test/MC/RISCV/rvk-user-csr-name.s
index 0615da0..0a3cb2d 100644
--- a/llvm/test/MC/RISCV/rvk-user-csr-name.s
+++ b/llvm/test/MC/RISCV/rvk-user-csr-name.s
@@ -1,10 +1,10 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+zkr < %s \
 # RUN:     | llvm-objdump -d --mattr=+zkr - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST-ALIAS %s
 #
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+zkr < %s \
 # RUN:     | llvm-objdump -d --mattr=+zkr - \
diff --git a/llvm/test/MC/RISCV/rvm-valid.s b/llvm/test/MC/RISCV/rvm-valid.s
index 8fb54bb..d0612c4 100644
--- a/llvm/test/MC/RISCV/rvm-valid.s
+++ b/llvm/test/MC/RISCV/rvm-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+m -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+m -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+m -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+m -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+m < %s \
 # RUN:     | llvm-objdump --mattr=+m -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvv-user-csr-names.s b/llvm/test/MC/RISCV/rvv-user-csr-names.s
index 71d4033..1f691bb 100644
--- a/llvm/test/MC/RISCV/rvv-user-csr-names.s
+++ b/llvm/test/MC/RISCV/rvv-user-csr-names.s
@@ -1,10 +1,10 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -mattr=+f -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -mattr=+f -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+v < %s \
 # RUN:     | llvm-objdump -d --mattr=+v - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST-ALIAS %s
 #
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -mattr=+f -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -mattr=+f -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+v < %s \
 # RUN:     | llvm-objdump -d --mattr=+v - \
diff --git a/llvm/test/MC/RISCV/rvv/aliases.s b/llvm/test/MC/RISCV/rvv/aliases.s
index 0dadeb1..c36bdb53 100644
--- a/llvm/test/MC/RISCV/rvv/aliases.s
+++ b/llvm/test/MC/RISCV/rvv/aliases.s
@@ -1,6 +1,6 @@
 # RUN: llvm-mc --triple=riscv64 -mattr +v < %s --show-encoding 2>&1 \
 # RUN:   -mattr +d | FileCheck --check-prefix=ALIAS %s
-# RUN: llvm-mc --triple=riscv64 -mattr=+v --riscv-no-aliases < %s \
+# RUN: llvm-mc --triple=riscv64 -mattr=+v --M no-aliases < %s \
 # RUN:   -mattr +d --show-encoding 2>&1 | FileCheck --check-prefix=NO-ALIAS %s
 
 # ALIAS:    vwcvt.x.x.v     v2, v1, v0.t    # encoding: [0x57,0x61,0x10,0xc4]
diff --git a/llvm/test/MC/RISCV/rvv/fothers.s b/llvm/test/MC/RISCV/rvv/fothers.s
index 997115f..0236d31 100644
--- a/llvm/test/MC/RISCV/rvv/fothers.s
+++ b/llvm/test/MC/RISCV/rvv/fothers.s
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+v %s \
-# RUN:   --mattr=+f --riscv-no-aliases \
+# RUN:   --mattr=+f --M no-aliases \
 # RUN:   | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 # RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \
 # RUN:   | FileCheck %s --check-prefix=CHECK-ERROR
diff --git a/llvm/test/MC/RISCV/rvv/freduction.s b/llvm/test/MC/RISCV/rvv/freduction.s
index 1232694..190d60f 100644
--- a/llvm/test/MC/RISCV/rvv/freduction.s
+++ b/llvm/test/MC/RISCV/rvv/freduction.s
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+v %s \
-# RUN:         --mattr=+f --riscv-no-aliases \
+# RUN:         --mattr=+f --M no-aliases \
 # RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 # RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \
 # RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
diff --git a/llvm/test/MC/RISCV/rvv/load.s b/llvm/test/MC/RISCV/rvv/load.s
index 3c251a3..9cd0ab4 100644
--- a/llvm/test/MC/RISCV/rvv/load.s
+++ b/llvm/test/MC/RISCV/rvv/load.s
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+v %s \
-# RUN:   --riscv-no-aliases | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+# RUN:   --M no-aliases | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 # RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \
 # RUN:   | FileCheck %s --check-prefix=CHECK-ERROR
 # RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+v %s \
diff --git a/llvm/test/MC/RISCV/rvv/others.s b/llvm/test/MC/RISCV/rvv/others.s
index cc16a87..0414585 100644
--- a/llvm/test/MC/RISCV/rvv/others.s
+++ b/llvm/test/MC/RISCV/rvv/others.s
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+v %s \
-# RUN:   --riscv-no-aliases | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+# RUN:   --M no-aliases | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 # RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \
 # RUN:   | FileCheck %s --check-prefix=CHECK-ERROR
 # RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+v %s \
diff --git a/llvm/test/MC/RISCV/rvv/store.s b/llvm/test/MC/RISCV/rvv/store.s
index c6a3470..ca9bb13 100644
--- a/llvm/test/MC/RISCV/rvv/store.s
+++ b/llvm/test/MC/RISCV/rvv/store.s
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+v %s \
-# RUN:   --riscv-no-aliases | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+# RUN:   --M no-aliases | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 # RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \
 # RUN:   | FileCheck %s --check-prefix=CHECK-ERROR
 # RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+v %s \
diff --git a/llvm/test/MC/RISCV/rvv/zvlsseg.s b/llvm/test/MC/RISCV/rvv/zvlsseg.s
index 65089e2..479d2f9 100644
--- a/llvm/test/MC/RISCV/rvv/zvlsseg.s
+++ b/llvm/test/MC/RISCV/rvv/zvlsseg.s
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+v %s \
-# RUN:   --riscv-no-aliases \
+# RUN:   --M no-aliases \
 # RUN:   | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 # RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \
 # RUN:   | FileCheck %s --check-prefix=CHECK-ERROR
diff --git a/llvm/test/MC/RISCV/rvzaamo-valid.s b/llvm/test/MC/RISCV/rvzaamo-valid.s
index d9ba6ef..e4805aa 100644
--- a/llvm/test/MC/RISCV/rvzaamo-valid.s
+++ b/llvm/test/MC/RISCV/rvzaamo-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+a -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+a -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a < %s \
 # RUN:     | llvm-objdump --mattr=+a -M no-aliases -d -r - \
@@ -8,9 +8,9 @@
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a < %s \
 # RUN:     | llvm-objdump --mattr=+a -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zaamo -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zaamo -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zaamo -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zaamo -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zaamo < %s \
 # RUN:     | llvm-objdump --mattr=+zaamo -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzabha-valid.s b/llvm/test/MC/RISCV/rvzabha-valid.s
index a3c61db..2b1b1e0 100644
--- a/llvm/test/MC/RISCV/rvzabha-valid.s
+++ b/llvm/test/MC/RISCV/rvzabha-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zabha -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zabha -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zabha -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zabha -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+zabha < %s \
 # RUN:     | llvm-objdump --mattr=+a,+zabha -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzabha-zacas-valid.s b/llvm/test/MC/RISCV/rvzabha-zacas-valid.s
index 97afb9d..4e271e4 100644
--- a/llvm/test/MC/RISCV/rvzabha-zacas-valid.s
+++ b/llvm/test/MC/RISCV/rvzabha-zacas-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zabha,+zacas -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zabha,+zacas -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zabha,+zacas -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zabha,+zacas -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+zabha,+zacas < %s \
 # RUN:     | llvm-objdump --mattr=+a,+zabha,+zacas -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzacas-valid.s b/llvm/test/MC/RISCV/rvzacas-valid.s
index 0e76f02..2524001 100644
--- a/llvm/test/MC/RISCV/rvzacas-valid.s
+++ b/llvm/test/MC/RISCV/rvzacas-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zacas -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zacas -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zacas -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zacas -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+zacas < %s \
 # RUN:     | llvm-objdump --mattr=+a,+zacas -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzalasr-valid.s b/llvm/test/MC/RISCV/rvzalasr-valid.s
index 7b2668b..11487ee 100644
--- a/llvm/test/MC/RISCV/rvzalasr-valid.s
+++ b/llvm/test/MC/RISCV/rvzalasr-valid.s
@@ -1,19 +1,19 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zalasr -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zalasr -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zalasr < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-zalasr -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zalasr -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zalasr -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zalasr < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-zalasr -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv32 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck --check-prefixes=CHECK-NO-EXT %s
 # RUN: not llvm-mc -triple riscv64 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck --check-prefixes=CHECK-NO-EXT %s
 
 # CHECK-ASM-AND-OBJ: lb.aq t1, (a0)
diff --git a/llvm/test/MC/RISCV/rvzalrsc-valid.s b/llvm/test/MC/RISCV/rvzalrsc-valid.s
index f84c0fd..01883da 100644
--- a/llvm/test/MC/RISCV/rvzalrsc-valid.s
+++ b/llvm/test/MC/RISCV/rvzalrsc-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+a -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+a -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a < %s \
 # RUN:     | llvm-objdump --mattr=+a -M no-aliases -d -r - \
@@ -8,9 +8,9 @@
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a < %s \
 # RUN:     | llvm-objdump --mattr=+a -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zalrsc -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zalrsc -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zalrsc -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zalrsc -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zalrsc < %s \
 # RUN:     | llvm-objdump --mattr=+zalrsc -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzbb-valid.s b/llvm/test/MC/RISCV/rvzbb-valid.s
index 1ed069e..1b060be 100644
--- a/llvm/test/MC/RISCV/rvzbb-valid.s
+++ b/llvm/test/MC/RISCV/rvzbb-valid.s
@@ -1,7 +1,7 @@
 # With Bitmanip base extension:
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbb -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbb -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zbb < %s \
 # RUN:     | llvm-objdump --mattr=+zbb --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzcb-invalid.s b/llvm/test/MC/RISCV/rvzcb-invalid.s
index 2f543b2..f53ab25 100644
--- a/llvm/test/MC/RISCV/rvzcb-invalid.s
+++ b/llvm/test/MC/RISCV/rvzcb-invalid.s
@@ -1,6 +1,6 @@
-# RUN: not llvm-mc -triple=riscv32 -mattr=zcb -riscv-no-aliases -show-encoding %s 2>&1 \
+# RUN: not llvm-mc -triple=riscv32 -mattr=zcb -M no-aliases -show-encoding %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-ERROR %s
-# RUN: not llvm-mc -triple=riscv64 -mattr=zcb -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple=riscv64 -mattr=zcb -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-ERROR %s
 
 # CHECK-ERROR: error: immediate must be an integer in the range [0, 3]
diff --git a/llvm/test/MC/RISCV/rvzcb-valid.s b/llvm/test/MC/RISCV/rvzcb-valid.s
index de25a38..b78ecef 100644
--- a/llvm/test/MC/RISCV/rvzcb-valid.s
+++ b/llvm/test/MC/RISCV/rvzcb-valid.s
@@ -1,19 +1,19 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+m,+zbb,+zba,+zcb -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+m,+zbb,+zba,+zcb -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+m,+zbb,+zba,+zcb < %s \
 # RUN:     | llvm-objdump --mattr=+m,+zbb,+zba,+zcb --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+m,+zbb,+zba,+zcb -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+m,+zbb,+zba,+zcb -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+m,+zbb,+zba,+zcb < %s \
 # RUN:     | llvm-objdump --mattr=+m,+zbb,+zba,zcb --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv32 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 # RUN: not llvm-mc -triple riscv64 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 
 # CHECK-ASM-AND-OBJ: c.zext.b s0
diff --git a/llvm/test/MC/RISCV/rvzcmt-invalid.s b/llvm/test/MC/RISCV/rvzcmt-invalid.s
index 5f964ed..0cd9f0b 100644
--- a/llvm/test/MC/RISCV/rvzcmt-invalid.s
+++ b/llvm/test/MC/RISCV/rvzcmt-invalid.s
@@ -1,6 +1,6 @@
-# RUN: not llvm-mc -triple=riscv32 -mattr=+zcmt -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple=riscv32 -mattr=+zcmt -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-ERROR %s
-# RUN: not llvm-mc -triple=riscv64 -mattr=+zcmt -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple=riscv64 -mattr=+zcmt -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-ERROR %s
 
 # CHECK-ERROR: error: immediate must be an integer in the range [0, 31]
diff --git a/llvm/test/MC/RISCV/rvzcmt-user-csr-name.s b/llvm/test/MC/RISCV/rvzcmt-user-csr-name.s
index 58fe43e..c1ab6bc 100644
--- a/llvm/test/MC/RISCV/rvzcmt-user-csr-name.s
+++ b/llvm/test/MC/RISCV/rvzcmt-user-csr-name.s
@@ -1,10 +1,10 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -mattr=+zcmt -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -mattr=+zcmt -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+zcmt < %s \
 # RUN:     | llvm-objdump -d --mattr=+zcmt - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST-ALIAS %s
 #
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -mattr=+zcmt -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -mattr=+zcmt -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+zcmt < %s \
 # RUN:     | llvm-objdump -d --mattr=+zcmt - \
diff --git a/llvm/test/MC/RISCV/rvzcmt-valid.s b/llvm/test/MC/RISCV/rvzcmt-valid.s
index a3829fe..a87e790 100644
--- a/llvm/test/MC/RISCV/rvzcmt-valid.s
+++ b/llvm/test/MC/RISCV/rvzcmt-valid.s
@@ -1,5 +1,5 @@
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+zcmt\
-# RUN:  -riscv-no-aliases -show-encoding \
+# RUN:  -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zcmt\
 # RUN:  -mattr=m < %s \
@@ -7,7 +7,7 @@
 # RUN:  -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+zcmt\
-# RUN:  -riscv-no-aliases -show-encoding \
+# RUN:  -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zcmt\
 # RUN:  -mattr=m < %s \
@@ -16,10 +16,10 @@
 # RUN:     | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv32 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 # RUN: not llvm-mc -triple riscv64 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 
 # CHECK-ASM-AND-OBJ: cm.jt 1
diff --git a/llvm/test/MC/RISCV/rvzdinx-aliases-valid.s b/llvm/test/MC/RISCV/rvzdinx-aliases-valid.s
index 96ec4a4..a24e36b 100644
--- a/llvm/test/MC/RISCV/rvzdinx-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rvzdinx-aliases-valid.s
@@ -1,8 +1,8 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zdinx -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zdinx -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+zdinx \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/rvzdinx-valid.s b/llvm/test/MC/RISCV/rvzdinx-valid.s
index bd1e231..623e281 100644
--- a/llvm/test/MC/RISCV/rvzdinx-valid.s
+++ b/llvm/test/MC/RISCV/rvzdinx-valid.s
@@ -1,9 +1,9 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zdinx -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zdinx -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zdinx %s \
 # RUN:     | llvm-objdump --mattr=+zdinx -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zdinx %s \
 # RUN:     | llvm-objdump --mattr=+zdinx -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzfbfmin-valid.s b/llvm/test/MC/RISCV/rvzfbfmin-valid.s
index aa8f8cc..6bca691 100644
--- a/llvm/test/MC/RISCV/rvzfbfmin-valid.s
+++ b/llvm/test/MC/RISCV/rvzfbfmin-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfbfmin -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfbfmin -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfbfmin -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfbfmin -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfbfmin,+f < %s \
 # RUN:     | llvm-objdump --mattr=+zfbfmin --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzfh-aliases-valid.s b/llvm/test/MC/RISCV/rvzfh-aliases-valid.s
index 4e33375..44b3cc1 100644
--- a/llvm/test/MC/RISCV/rvzfh-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rvzfh-aliases-valid.s
@@ -1,8 +1,8 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfh -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfh -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+zfh \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/rvzfh-valid.s b/llvm/test/MC/RISCV/rvzfh-valid.s
index ec21f40..ccc6d10 100644
--- a/llvm/test/MC/RISCV/rvzfh-valid.s
+++ b/llvm/test/MC/RISCV/rvzfh-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfh -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfh < %s \
 # RUN:     | llvm-objdump --mattr=+zfh --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzfhmin-valid.s b/llvm/test/MC/RISCV/rvzfhmin-valid.s
index 63e5e98..fd7b36a 100644
--- a/llvm/test/MC/RISCV/rvzfhmin-valid.s
+++ b/llvm/test/MC/RISCV/rvzfhmin-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfhmin,+d -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfhmin,+d -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfhmin,+d -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfhmin,+d -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfhmin,+d < %s \
 # RUN:     | llvm-objdump --mattr=+zfhmin,+d --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzfinx-aliases-valid.s b/llvm/test/MC/RISCV/rvzfinx-aliases-valid.s
index f9225cf..83e1660 100644
--- a/llvm/test/MC/RISCV/rvzfinx-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rvzfinx-aliases-valid.s
@@ -1,8 +1,8 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfinx -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfinx -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+zfinx \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/rvzfinx-valid.s b/llvm/test/MC/RISCV/rvzfinx-valid.s
index 58f805c..09a5f9e 100644
--- a/llvm/test/MC/RISCV/rvzfinx-valid.s
+++ b/llvm/test/MC/RISCV/rvzfinx-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfinx -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfinx -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfinx %s \
 # RUN:     | llvm-objdump --mattr=+zfinx -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzhinx-aliases-valid.s b/llvm/test/MC/RISCV/rvzhinx-aliases-valid.s
index dbefc5a..8f69558 100644
--- a/llvm/test/MC/RISCV/rvzhinx-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rvzhinx-aliases-valid.s
@@ -1,8 +1,8 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinx -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinx -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinx \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/rvzhinx-valid.s b/llvm/test/MC/RISCV/rvzhinx-valid.s
index 97ec9dd..dc244b2 100644
--- a/llvm/test/MC/RISCV/rvzhinx-valid.s
+++ b/llvm/test/MC/RISCV/rvzhinx-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinx -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinx -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zhinx %s \
 # RUN:     | llvm-objdump --mattr=+zhinx -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzhinxmin-valid.s b/llvm/test/MC/RISCV/rvzhinxmin-valid.s
index fbdbce0..1773b29 100644
--- a/llvm/test/MC/RISCV/rvzhinxmin-valid.s
+++ b/llvm/test/MC/RISCV/rvzhinxmin-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinxmin -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinxmin -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinxmin -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinxmin -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zhinxmin %s \
 # RUN:     | llvm-objdump --mattr=+zhinxmin -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzihintntl-valid.s b/llvm/test/MC/RISCV/rvzihintntl-valid.s
index f7601c3..415070a 100644
--- a/llvm/test/MC/RISCV/rvzihintntl-valid.s
+++ b/llvm/test/MC/RISCV/rvzihintntl-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zihintntl -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zihintntl -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zihintntl -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zihintntl -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zihintntl < %s \
 # RUN:     | llvm-objdump --mattr=+zihintntl -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzihintpause-valid.s b/llvm/test/MC/RISCV/rvzihintpause-valid.s
index 3ffc387..44cebae 100644
--- a/llvm/test/MC/RISCV/rvzihintpause-valid.s
+++ b/llvm/test/MC/RISCV/rvzihintpause-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zihintpause -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zihintpause -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zihintpause -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zihintpause -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zihintpause < %s \
 # RUN:     | llvm-objdump --mattr=+zihintpause -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/smctr-ssctr-valid.s b/llvm/test/MC/RISCV/smctr-ssctr-valid.s
index 0b4fe47..8bbd5a4 100644
--- a/llvm/test/MC/RISCV/smctr-ssctr-valid.s
+++ b/llvm/test/MC/RISCV/smctr-ssctr-valid.s
@@ -1,10 +1,10 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-smctr -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-smctr -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-smctr -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-smctr -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-ssctr -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-ssctr -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-ssctr -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-ssctr -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-smctr < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-smctr -M no-aliases -d - \
@@ -19,9 +19,9 @@
 # RUN:     | llvm-objdump --mattr=+experimental-ssctr -M no-aliases -d - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 
-# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv32 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
-# RUN: not llvm-mc -triple riscv64 -defsym=RV64=1 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv64 -defsym=RV64=1 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 
 # CHECK-INST: sctrclr
diff --git a/llvm/test/MC/RISCV/smrnmi-valid.s b/llvm/test/MC/RISCV/smrnmi-valid.s
index d330ece..8c57a4d 100644
--- a/llvm/test/MC/RISCV/smrnmi-valid.s
+++ b/llvm/test/MC/RISCV/smrnmi-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+smrnmi -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+smrnmi -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+smrnmi -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+smrnmi -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+smrnmi < %s \
 # RUN:     | llvm-objdump --mattr=+smrnmi -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/supervisor-csr-names.s b/llvm/test/MC/RISCV/supervisor-csr-names.s
index db0fcb3..712ec56 100644
--- a/llvm/test/MC/RISCV/supervisor-csr-names.s
+++ b/llvm/test/MC/RISCV/supervisor-csr-names.s
@@ -1,10 +1,10 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-objdump -d - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST-ALIAS %s
 #
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
 # RUN:     | llvm-objdump -d - \
diff --git a/llvm/test/MC/RISCV/user-csr-names.s b/llvm/test/MC/RISCV/user-csr-names.s
index f49eace..bc7363f 100644
--- a/llvm/test/MC/RISCV/user-csr-names.s
+++ b/llvm/test/MC/RISCV/user-csr-names.s
@@ -1,10 +1,10 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-objdump -d - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST-ALIAS %s
 #
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
 # RUN:     | llvm-objdump -d - \
diff --git a/llvm/test/MC/RISCV/xqcia-valid.s b/llvm/test/MC/RISCV/xqcia-valid.s
index 6bd1049..9382856 100644
--- a/llvm/test/MC/RISCV/xqcia-valid.s
+++ b/llvm/test/MC/RISCV/xqcia-valid.s
@@ -1,5 +1,5 @@
 # Xqcia - Qualcomm uC Arithmetic Extesnsion
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcia -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcia -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcia < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-xqcia -M no-aliases --no-print-imm-hex -d - \
diff --git a/llvm/test/MC/RISCV/xqciac-invalid.s b/llvm/test/MC/RISCV/xqciac-invalid.s
new file mode 100644
index 0000000..4e0182a
--- /dev/null
+++ b/llvm/test/MC/RISCV/xqciac-invalid.s
@@ -0,0 +1,43 @@
+# Xqciac - Qualcomm uC Load-Store Address Calculation Extension
+# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-xqciac < %s 2>&1 \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-IMM %s
+# RUN: not llvm-mc -triple riscv32 -mattr=-experimental-xqciac < %s 2>&1 \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-EXT %s
+
+# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction
+qc.c.muladdi x5, x10, 4
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.c.muladdi x15
+
+# CHECK-IMM: :[[@LINE+1]]:24: error: immediate must be an integer in the range [0, 31]
+qc.c.muladdi x10, x15, 32
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension)
+qc.c.muladdi x10, x15, 20
+
+
+# CHECK: :[[@LINE+1]]:12: error: invalid operand for instruction
+qc.muladdi x0, x10, 1048577
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.muladdi x10
+
+# CHECK-IMM: :[[@LINE+1]]:22: error: operand must be a symbol with %lo/%pcrel_lo/%tprel_lo modifier or an integer in the range [-2048, 2047]
+qc.muladdi x10, x15, 8589934592
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension)
+qc.muladdi x10, x15, 577
+
+
+# CHECK: :[[@LINE+1]]:11: error: invalid operand for instruction
+qc.shladd 0, x10, 1048577
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.shladd x10
+
+# CHECK-IMM: :[[@LINE+1]]:26: error: immediate must be an integer in the range [4, 31]
+qc.shladd x10, x15, x11, 2
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension)
+qc.shladd x10, x15, x11, 5
diff --git a/llvm/test/MC/RISCV/xqciac-valid.s b/llvm/test/MC/RISCV/xqciac-valid.s
new file mode 100644
index 0000000..6e97d8c
--- /dev/null
+++ b/llvm/test/MC/RISCV/xqciac-valid.s
@@ -0,0 +1,49 @@
+# Xqciac - Qualcomm uC Load-Store Address Calculation Extension
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqciac -riscv-no-aliases -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqciac < %s \
+# RUN:     | llvm-objdump --mattr=+experimental-xqciac -M no-aliases --no-print-imm-hex -d - \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqciac -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqciac < %s \
+# RUN:     | llvm-objdump --mattr=+experimental-xqciac --no-print-imm-hex -d - \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+
+# CHECK-INST: qc.c.muladdi    a0, a1, 0
+# CHECK-ENC: encoding: [0x8a,0x21]
+qc.c.muladdi x10, x11, 0
+
+# CHECK-INST: qc.c.muladdi    a0, a1, 31
+# CHECK-ENC: encoding: [0xea,0x3d]
+qc.c.muladdi x10, x11, 31
+
+# CHECK-INST: qc.c.muladdi    a0, a1, 16
+# CHECK-ENC: encoding: [0xaa,0x21]
+qc.c.muladdi x10, x11, 16
+
+
+# CHECK-INST: qc.muladdi      tp, t0, 1234
+# CHECK-ENC: encoding: [0x0b,0xe2,0x22,0x4d]
+qc.muladdi x4, x5, 1234
+
+# CHECK-INST: qc.muladdi      a0, a1, -2048
+# CHECK-ENC: encoding: [0x0b,0xe5,0x05,0x80]
+qc.muladdi x10, x11, -2048
+
+# CHECK-INST: qc.muladdi      a0, a1, 2047
+# CHECK-ENC: encoding: [0x0b,0xe5,0xf5,0x7f]
+qc.muladdi x10, x11, 2047
+
+
+# CHECK-INST: qc.shladd       tp, t0, t1, 12
+# CHECK-ENC: encoding: [0x0b,0xb2,0x62,0x58]
+qc.shladd x4, x5, x6, 12
+
+# CHECK-INST: qc.shladd       a0, a1, a2, 4
+# CHECK-ENC: encoding: [0x0b,0xb5,0xc5,0x48]
+qc.shladd x10, x11, x12, 4
+
+# CHECK-INST: qc.shladd       a0, a1, a2, 31
+# CHECK-ENC: encoding: [0x0b,0xb5,0xc5,0x7e]
+qc.shladd x10, x11, x12, 31
diff --git a/llvm/test/MC/RISCV/xqcicli-invalid.s b/llvm/test/MC/RISCV/xqcicli-invalid.s
new file mode 100644
index 0000000..7ee92ec
--- /dev/null
+++ b/llvm/test/MC/RISCV/xqcicli-invalid.s
@@ -0,0 +1,232 @@
+# Xqcicli - Qualcomm uC Conditional Load Immediate Instructions
+# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-xqcicli < %s 2>&1 \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-PLUS %s
+# RUN: not llvm-mc -triple riscv32 -mattr=-experimental-xqcicli < %s 2>&1 \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-MINUS %s
+
+# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.lieq x0, x4, x6, 10
+
+# CHECK: :[[@LINE+1]]:13: error: invalid operand for instruction
+qc.lieq x2, x0, x6, 10
+
+# CHECK: :[[@LINE+1]]:17: error: invalid operand for instruction
+qc.lieq x2, x4, x0, 10
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.lieq x2, x4, x6
+
+# CHECK-PLUS: :[[@LINE+1]]:21: error: immediate must be an integer in the range [-16, 15]
+qc.lieq x2, x4, x6, 40
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.lieq x2, x4, x6, 10
+
+
+# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.lige x0, x8, x20, 2
+
+# CHECK: :[[@LINE+1]]:13: error: invalid operand for instruction
+qc.lige x4, x0, x20, 2
+
+# CHECK: :[[@LINE+1]]:17: error: invalid operand for instruction
+qc.lige x4, x8, x0, 2
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.lige x4, x8, x20
+
+# CHECK-PLUS: :[[@LINE+1]]:22: error: immediate must be an integer in the range [-16, 15]
+qc.lige x4, x8, x20, -18
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.lige x4, x8, x20, 2
+
+
+# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.lilt x0, x9, x10, 3
+
+# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction
+qc.lilt x19, x0, x10, 3
+
+# CHECK: :[[@LINE+1]]:18: error: invalid operand for instruction
+qc.lilt x19, x9, x0, 3
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.lilt x19, x9, x10
+
+# CHECK-PLUS: :[[@LINE+1]]:23: error: immediate must be an integer in the range [-16, 15]
+qc.lilt x19, x9, x10, 39
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.lilt x19, x9, x10, 3
+
+
+# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.line x0, x14, x6, 10
+
+# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction
+qc.line x18, x0, x6, 10
+
+# CHECK: :[[@LINE+1]]:19: error: invalid operand for instruction
+qc.line x18, x14, x0, 10
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.line x18, x14, x6
+
+# CHECK-PLUS: :[[@LINE+1]]:23: error: immediate must be an integer in the range [-16, 15]
+qc.line x18, x14, x6, 100
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.line x18, x14, x6, 10
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.ligeu x0, x4, x6, 10
+
+# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction
+qc.ligeu x2, x0, x6, 10
+
+# CHECK: :[[@LINE+1]]:18: error: invalid operand for instruction
+qc.ligeu x2, x4, x0, 10
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.ligeu x2, x4, x6
+
+# CHECK-PLUS: :[[@LINE+1]]:22: error: immediate must be an integer in the range [-16, 15]
+qc.ligeu x2, x4, x6, 70
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.ligeu x2, x4, x6, 10
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.liltu x0, x19, x12, 13
+
+# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction
+qc.liltu x1, x0, x12, 13
+
+# CHECK: :[[@LINE+1]]:19: error: invalid operand for instruction
+qc.liltu x1, x19, x0, 13
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.liltu x1, x19, x12
+
+# CHECK-PLUS: :[[@LINE+1]]:24: error: immediate must be an integer in the range [-16, 15]
+qc.liltu x1, x19, x12, 73
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.liltu x1, x19, x12, 13
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.lieqi x0, x1, 15, 12
+
+# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction
+qc.lieqi x7, x0, 15, 12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.lieqi x7, x1, 15
+
+# CHECK-PLUS: :[[@LINE+1]]:18: error: immediate must be an integer in the range [-16, 15]
+qc.lieqi x7, x1, 25, 12
+
+# CHECK-PLUS: :[[@LINE+1]]:22: error: immediate must be an integer in the range [-16, 15]
+qc.lieqi x7, x1, 15, -22
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.lieqi x7, x1, 15, 12
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.ligei x0, x11, -4, 9
+
+# CHECK: :[[@LINE+1]]:15: error: invalid operand for instruction
+qc.ligei x17, x0, -4, 9
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.ligei x17, x11, -4
+
+# CHECK-PLUS: :[[@LINE+1]]:20: error: immediate must be an integer in the range [-16, 15]
+qc.ligei x17, x11, -24, 9
+
+# CHECK-PLUS: :[[@LINE+1]]:24: error: immediate must be an integer in the range [-16, 15]
+qc.ligei x17, x11, -4, 59
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.ligei x17, x11, -4, 9
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.lilti x0, x11, -14, 2
+
+# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction
+qc.lilti x9, x0, -14, 2
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.lilti x9, x11, -14
+
+# CHECK-PLUS: :[[@LINE+1]]:19: error: immediate must be an integer in the range [-16, 15]
+qc.lilti x9, x11, -84, 2
+
+# CHECK-PLUS: :[[@LINE+1]]:24: error: immediate must be an integer in the range [-16, 15]
+qc.lilti x9, x11, -14, 52
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.lilti x9, x11, -14, 2
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.linei x0, x1, 10, 12
+
+# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction
+qc.linei x5, x0, 10, 12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.linei x5, x1, 10
+
+# CHECK-PLUS: :[[@LINE+1]]:18: error: immediate must be an integer in the range [-16, 15]
+qc.linei x5, x1, 130, 12
+
+# CHECK-PLUS: :[[@LINE+1]]:22: error: immediate must be an integer in the range [-16, 15]
+qc.linei x5, x1, 10, 124
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.linei x5, x1, 10, 12
+
+
+# CHECK: :[[@LINE+1]]:11: error: invalid operand for instruction
+qc.ligeui x0, x12, 7, -12
+
+# CHECK: :[[@LINE+1]]:15: error: invalid operand for instruction
+qc.ligeui x2, x0, 7, -12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.ligeui x2, x12, 7
+
+# CHECK-PLUS: :[[@LINE+1]]:20: error: immediate must be an integer in the range [0, 31]
+qc.ligeui x2, x12, -7, -12
+
+# CHECK-PLUS: :[[@LINE+1]]:23: error: immediate must be an integer in the range [-16, 15]
+qc.ligeui x2, x12, 7, -17
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.ligeui x2, x12, 7, -12
+
+
+# CHECK: :[[@LINE+1]]:11: error: invalid operand for instruction
+qc.liltui x0, x25, 31, 12
+
+# CHECK: :[[@LINE+1]]:15: error: invalid operand for instruction
+qc.liltui x3, x0, 31, 12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.liltui x3, x25, 31
+
+# CHECK-PLUS: :[[@LINE+1]]:20: error: immediate must be an integer in the range [0, 31]
+qc.liltui x3, x25, 32, 12
+
+# CHECK-PLUS: :[[@LINE+1]]:24: error: immediate must be an integer in the range [-16, 15]
+qc.liltui x3, x25, 31, 112
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.liltui x3, x25, 31, 12
diff --git a/llvm/test/MC/RISCV/xqcicli-valid.s b/llvm/test/MC/RISCV/xqcicli-valid.s
new file mode 100644
index 0000000..404bfdf
--- /dev/null
+++ b/llvm/test/MC/RISCV/xqcicli-valid.s
@@ -0,0 +1,59 @@
+# Xqcicli - Qualcomm uC Conditional Load Immediate Extension
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcicli -riscv-no-aliases -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcicli < %s \
+# RUN:     | llvm-objdump --mattr=+experimental-xqcicli -M no-aliases --no-print-imm-hex -d - \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcicli -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcicli < %s \
+# RUN:     | llvm-objdump --mattr=+experimental-xqcicli --no-print-imm-hex -d - \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+
+# CHECK-INST: qc.lieq   sp, tp, t1, 10
+# CHECK-ENC: encoding: [0x5b,0x01,0x62,0x52]
+qc.lieq x2, x4, x6, 10
+
+# CHECK-INST: qc.lieqi  t2, ra, 15, 12
+# CHECK-ENC: encoding: [0xdb,0x83,0xf0,0x66]
+qc.lieqi x7, x1, 15, 12
+
+# CHECK-INST: qc.lige   tp, s0, s4, 2
+# CHECK-ENC: encoding: [0x5b,0x52,0x44,0x13]
+qc.lige x4, x8, x20, 2
+
+# CHECK-INST: qc.ligei  a7, a1, -4, 9
+# CHECK-ENC: encoding: [0xdb,0xd8,0xc5,0x4f]
+qc.ligei x17, x11, -4, 9
+
+# CHECK-INST: qc.ligeu  sp, tp, t1, 10
+# CHECK-ENC: encoding: [0x5b,0x71,0x62,0x52]
+qc.ligeu x2, x4, x6, 10
+
+# CHECK-INST: qc.ligeui sp, a2, 7, -12
+# CHECK-ENC: encoding: [0x5b,0x71,0x76,0xa6]
+qc.ligeui x2, x12, 7, -12
+
+# CHECK-INST: qc.lilt   s3, s1, a0, 3
+# CHECK-ENC: encoding: [0xdb,0xc9,0xa4,0x1a]
+qc.lilt x19, x9, x10, 3
+
+# CHECK-INST: qc.lilti  s1, a1, -14, 2
+# CHECK-ENC: encoding: [0xdb,0xc4,0x25,0x17]
+qc.lilti x9, x11, -14, 2
+
+# CHECK-INST: qc.liltu  ra, s3, a2, 13
+# CHECK-ENC: encoding: [0xdb,0xe0,0xc9,0x6a]
+qc.liltu x1, x19, x12, 13
+
+# CHECK-INST: qc.liltui gp, s9, 31, 12
+# CHECK-ENC: encoding: [0xdb,0xe1,0xfc,0x67]
+qc.liltui x3, x25, 31, 12
+
+# CHECK-INST: qc.line   s2, a4, t1, 10
+# CHECK-ENC: encoding: [0x5b,0x19,0x67,0x52]
+qc.line x18, x14, x6, 10
+
+# CHECK-INST: qc.linei  t0, ra, 10, 12
+# CHECK-ENC: encoding: [0xdb,0x92,0xa0,0x66]
+qc.linei x5, x1, 10, 12
diff --git a/llvm/test/MC/RISCV/xqcicm-invalid.s b/llvm/test/MC/RISCV/xqcicm-invalid.s
new file mode 100644
index 0000000..8b37ed4
--- /dev/null
+++ b/llvm/test/MC/RISCV/xqcicm-invalid.s
@@ -0,0 +1,152 @@
+# Xqcicm - Qualcomm uC Conditional Move Extension
+# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-xqcicm < %s 2>&1 \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-IMM %s
+# RUN: not llvm-mc -triple riscv32 -mattr=-experimental-xqcicm < %s 2>&1 \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-EXT %s
+
+# CHECK: :[[@LINE+1]]:12: error: invalid operand for instruction
+qc.c.mveqz 9, x10
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.c.mveqz x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.c.mveqz x9, x10
+
+
+# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.mveq 9, x10, x11, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mveq x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mveq x9, x10, x11, x12
+
+
+# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.mvge 9, x10, x11, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvge x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvge x9, x10, x11, x12
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.mvgeu 9, x10, x11, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvgeu x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvgeu x9, x10, x11, x12
+
+
+# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.mvlt 9, x10, x11, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvlt x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvlt x9, x10, x11, x12
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.mvltu 9, x10, x11, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvltu x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvltu x9, x10, x11, x12
+
+
+# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.mvne 9, x10, x11, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvne x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvne x9, x10, x11, x12
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.mveqi 9, x10, 5, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mveqi x9
+
+# CHECK-IMM: :[[@LINE+1]]:19: error: immediate must be an integer in the range [-16, 15]
+qc.mveqi x9, x10, 17, x12
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mveqi x9, x10, 5, x12
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.mvgei 9, x10, 5, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvgei x9
+
+# CHECK-IMM: :[[@LINE+1]]:19: error: immediate must be an integer in the range [-16, 15]
+qc.mvgei x9, x10, 17, x12
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvgei x9, x10, 5, x12
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.mvlti 9, x10, 5, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvlti x9
+
+# CHECK-IMM: :[[@LINE+1]]:19: error: immediate must be an integer in the range [-16, 15]
+qc.mvlti x9, x10, 17, x12
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvlti x9, x10, 5, x12
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.mvnei 9, x10, 5, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvnei x9
+
+# CHECK-IMM: :[[@LINE+1]]:19: error: immediate must be an integer in the range [-16, 15]
+qc.mvnei x9, x10, 17, x12
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvnei x9, x10, 5, x12
+
+
+# CHECK: :[[@LINE+1]]:11: error: invalid operand for instruction
+qc.mvltui 9, x10, 5, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvltui x9
+
+# CHECK-IMM: :[[@LINE+1]]:20: error: immediate must be an integer in the range [0, 31]
+qc.mvltui x9, x10, 37, x12
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvltui x9, x10, 5, x12
+
+
+# CHECK: :[[@LINE+1]]:11: error: invalid operand for instruction
+qc.mvgeui 9, x10, 5, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvgeui x9
+
+# CHECK-IMM: :[[@LINE+1]]:20: error: immediate must be an integer in the range [0, 31]
+qc.mvgeui x9, x10, 37, x12
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvgeui x9, x10, 5, x12
diff --git a/llvm/test/MC/RISCV/xqcicm-valid.s b/llvm/test/MC/RISCV/xqcicm-valid.s
new file mode 100644
index 0000000..7d0050b
--- /dev/null
+++ b/llvm/test/MC/RISCV/xqcicm-valid.s
@@ -0,0 +1,123 @@
+# Xqcicm - Qualcomm uC Conditional Move Extension
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcicm -riscv-no-aliases -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcicm < %s \
+# RUN:     | llvm-objdump --mattr=+experimental-xqcicm -M no-aliases --no-print-imm-hex -d - \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcicm -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcicm < %s \
+# RUN:     | llvm-objdump --mattr=+experimental-xqcicm --no-print-imm-hex -d - \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+
+# CHECK-INST: qc.c.mveqz      s1, a0
+# CHECK-ENC: encoding: [0x06,0xad]
+qc.c.mveqz x9, x10
+
+
+# CHECK-INST: qc.mveq s1, a0, a1, a2
+# CHECK-ENC: encoding: [0xdb,0x04,0xb5,0x60]
+qc.mveq x9, x10, x11, x12
+
+
+# CHECK-INST: qc.mvge s1, a0, a1, a2
+# CHECK-ENC: encoding: [0xdb,0x54,0xb5,0x60]
+qc.mvge x9, x10, x11, x12
+
+
+# CHECK-INST: qc.mvgeu        s1, a0, a1, a2
+# CHECK-ENC: encoding: [0xdb,0x74,0xb5,0x60]
+qc.mvgeu x9, x10, x11, x12
+
+
+# CHECK-INST: qc.mvlt s1, a0, a1, a2
+# CHECK-ENC: encoding: [0xdb,0x44,0xb5,0x60]
+qc.mvlt x9, x10, x11, x12
+
+
+# CHECK-INST: qc.mvltu        s1, a0, a1, a2
+# CHECK-ENC: encoding: [0xdb,0x64,0xb5,0x60]
+qc.mvltu x9, x10, x11, x12
+
+
+# CHECK-INST: qc.mvne s1, a0, a1, a2
+# CHECK-ENC: encoding: [0xdb,0x14,0xb5,0x60]
+qc.mvne x9, x10, x11, x12
+
+
+# CHECK-INST: qc.mveqi        s1, a0, 5, a2
+# CHECK-ENC: encoding: [0xdb,0x04,0x55,0x64]
+qc.mveqi x9, x10, 5, x12
+
+# CHECK-INST: qc.mveqi        s1, a0, -16, a2
+# CHECK-ENC: encoding: [0xdb,0x04,0x05,0x65]
+qc.mveqi x9, x10, -16, x12
+
+# CHECK-INST: qc.mveqi        s1, a0, 15, a2
+# CHECK-ENC: encoding: [0xdb,0x04,0xf5,0x64]
+qc.mveqi x9, x10, 15, x12
+
+
+# CHECK-INST: qc.mvgei        s1, a0, 5, a2
+# CHECK-ENC: encoding: [0xdb,0x54,0x55,0x64]
+qc.mvgei x9, x10, 5, x12
+
+# CHECK-INST: qc.mvgei        s1, a0, -16, a2
+# CHECK-ENC: encoding: [0xdb,0x54,0x05,0x65]
+qc.mvgei x9, x10, -16, x12
+
+# CHECK-INST: qc.mvgei        s1, a0, 15, a2
+# CHECK-ENC: encoding: [0xdb,0x54,0xf5,0x64]
+qc.mvgei x9, x10, 15, x12
+
+
+# CHECK-INST: qc.mvlti        s1, a0, 5, a2
+# CHECK-ENC: encoding: [0xdb,0x44,0x55,0x64]
+qc.mvlti x9, x10, 5, x12
+
+# CHECK-INST: qc.mvlti        s1, a0, -16, a2
+# CHECK-ENC: encoding: [0xdb,0x44,0x05,0x65]
+qc.mvlti x9, x10, -16, x12
+
+# CHECK-INST: qc.mvlti        s1, a0, 15, a2
+# CHECK-ENC: encoding: [0xdb,0x44,0xf5,0x64]
+qc.mvlti x9, x10, 15, x12
+
+
+# CHECK-INST: qc.mvnei        s1, a0, 5, a2
+# CHECK-ENC: encoding: [0xdb,0x14,0x55,0x64]
+qc.mvnei x9, x10, 5, x12
+
+# CHECK-INST: qc.mvnei        s1, a0, -16, a2
+# CHECK-ENC: encoding: [0xdb,0x14,0x05,0x65]
+qc.mvnei x9, x10, -16, x12
+
+# CHECK-INST: qc.mvnei        s1, a0, 15, a2
+# CHECK-ENC: encoding: [0xdb,0x14,0xf5,0x64]
+qc.mvnei x9, x10, 15, x12
+
+
+# CHECK-INST: qc.mvltui       s1, a0, 5, a2
+# CHECK-ENC: encoding: [0xdb,0x64,0x55,0x64]
+qc.mvltui x9, x10, 5, x12
+
+# CHECK-INST: qc.mvltui       s1, a0, 0, a2
+# CHECK-ENC: encoding: [0xdb,0x64,0x05,0x64]
+qc.mvltui x9, x10, 0, x12
+
+# CHECK-INST: qc.mvltui       s1, a0, 31, a2
+# CHECK-ENC: encoding: [0xdb,0x64,0xf5,0x65]
+qc.mvltui x9, x10, 31, x12
+
+
+# CHECK-INST: qc.mvgeui       s1, a0, 5, a2
+# CHECK-ENC: encoding: [0xdb,0x74,0x55,0x64]
+qc.mvgeui x9, x10, 5, x12
+
+# CHECK-INST: qc.mvgeui       s1, a0, 0, a2
+# CHECK-ENC: encoding: [0xdb,0x74,0x05,0x64]
+qc.mvgeui x9, x10, 0, x12
+
+# CHECK-INST: qc.mvgeui       s1, a0, 31, a2
+# CHECK-ENC: encoding: [0xdb,0x74,0xf5,0x65]
+qc.mvgeui x9, x10, 31, x12
diff --git a/llvm/test/MC/RISCV/xqcics-valid.s b/llvm/test/MC/RISCV/xqcics-valid.s
index eb888a6..1438f67 100644
--- a/llvm/test/MC/RISCV/xqcics-valid.s
+++ b/llvm/test/MC/RISCV/xqcics-valid.s
@@ -1,5 +1,5 @@
 # Xqcics - Qualcomm uC Conditional Select Extension
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcics -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcics -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcics < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-xqcics -M no-aliases --no-print-imm-hex -d - \
diff --git a/llvm/test/MC/RISCV/xqcicsr-valid.s b/llvm/test/MC/RISCV/xqcicsr-valid.s
index 1236dd6..ab26098 100644
--- a/llvm/test/MC/RISCV/xqcicsr-valid.s
+++ b/llvm/test/MC/RISCV/xqcicsr-valid.s
@@ -1,5 +1,5 @@
 # Xqcicsr - Qualcomm uC CSR Extension
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcicsr -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcicsr -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcicsr < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-xqcicsr -M no-aliases --no-print-imm-hex -d - \
diff --git a/llvm/test/MC/RISCV/xqcilsm-aliases-valid.s b/llvm/test/MC/RISCV/xqcilsm-aliases-valid.s
index e9aec14..b65a831 100644
--- a/llvm/test/MC/RISCV/xqcilsm-aliases-valid.s
+++ b/llvm/test/MC/RISCV/xqcilsm-aliases-valid.s
@@ -1,5 +1,5 @@
 # Xqcilsm - Qualcomm uC Load Store Multiple Extension
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcilsm -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcilsm -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcilsm < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-xqcilsm -M no-aliases --no-print-imm-hex -d - \
diff --git a/llvm/test/MC/RISCV/xqcilsm-valid.s b/llvm/test/MC/RISCV/xqcilsm-valid.s
index 4893e07..cbe25a2 100644
--- a/llvm/test/MC/RISCV/xqcilsm-valid.s
+++ b/llvm/test/MC/RISCV/xqcilsm-valid.s
@@ -1,5 +1,5 @@
 # Xqcilsm - Qualcomm uC Load Store Multiple Extension
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcilsm -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcilsm -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcilsm < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-xqcilsm -M no-aliases --no-print-imm-hex -d - \
diff --git a/llvm/test/MC/RISCV/xqcisls-valid.s b/llvm/test/MC/RISCV/xqcisls-valid.s
index 32f64a8..d7e80b3 100644
--- a/llvm/test/MC/RISCV/xqcisls-valid.s
+++ b/llvm/test/MC/RISCV/xqcisls-valid.s
@@ -1,5 +1,5 @@
 # Xqcisls - Qualcomm uC Scaled Load Store Extension
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcisls -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcisls -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcisls < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-xqcisls -M no-aliases --no-print-imm-hex -d - \
diff --git a/llvm/test/MC/RISCV/xsifive-valid.s b/llvm/test/MC/RISCV/xsifive-valid.s
index 8aa0ab1..bf59981 100644
--- a/llvm/test/MC/RISCV/xsifive-valid.s
+++ b/llvm/test/MC/RISCV/xsifive-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease < %s \
 # RUN:     | llvm-objdump --mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/xwchc-compress.s b/llvm/test/MC/RISCV/xwchc-compress.s
index 4bdce1c..7964497 100644
--- a/llvm/test/MC/RISCV/xwchc-compress.s
+++ b/llvm/test/MC/RISCV/xwchc-compress.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple riscv32 -mattr=+xwchc -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv32 -mattr=+xwchc -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -triple riscv32 -mattr=+xwchc -filetype=obj < %s \
 # RUN:   | llvm-objdump  --triple=riscv32 --mattr=+xwchc --no-print-imm-hex -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/xwchc-valid.s b/llvm/test/MC/RISCV/xwchc-valid.s
index 292a042..51767941 100644
--- a/llvm/test/MC/RISCV/xwchc-valid.s
+++ b/llvm/test/MC/RISCV/xwchc-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+xwchc -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+xwchc -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+xwchc < %s \
 # RUN:     | llvm-objdump --mattr=+xwchc --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/zfa-double-invalid.s b/llvm/test/MC/RISCV/zfa-double-invalid.s
index ec21b0c..f28bd5c 100644
--- a/llvm/test/MC/RISCV/zfa-double-invalid.s
+++ b/llvm/test/MC/RISCV/zfa-double-invalid.s
@@ -1,8 +1,8 @@
 # RUN: not llvm-mc -triple riscv32 -mattr=+zfa,+zfh \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXTD %s
 # RUN: not llvm-mc -triple riscv64 -mattr=+zfa,+zfh \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXTD %s
 
 # CHECK-NO-EXTD: error: instruction requires the following: 'D' (Double-Precision Floating-Point){{$}}
diff --git a/llvm/test/MC/RISCV/zfa-half-invalid.s b/llvm/test/MC/RISCV/zfa-half-invalid.s
index a2c6f09..debaf71 100644
--- a/llvm/test/MC/RISCV/zfa-half-invalid.s
+++ b/llvm/test/MC/RISCV/zfa-half-invalid.s
@@ -1,8 +1,8 @@
 # RUN: not llvm-mc -triple riscv32 -mattr=+zfa,+d \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXTZFH %s
 # RUN: not llvm-mc -triple riscv64 -mattr=+zfa,+d \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXTZFH %s
 
 # CHECK-NO-EXTZFH: error: instruction requires the following: 'Zfh' (Half-Precision Floating-Point){{$}}
diff --git a/llvm/test/MC/RISCV/zfa-valid.s b/llvm/test/MC/RISCV/zfa-valid.s
index e951c9d..6e78a4c 100644
--- a/llvm/test/MC/RISCV/zfa-valid.s
+++ b/llvm/test/MC/RISCV/zfa-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+d,+zfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+d,+zfh -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfa,+d,+zfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfa,+d,+zfh -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfa,+d,+zfh < %s \
 # RUN:     | llvm-objdump --mattr=+zfa,+d,+zfh -M no-aliases -d -r - \
@@ -10,10 +10,10 @@
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv32 -mattr=+d,+zfh \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 # RUN: not llvm-mc -triple riscv64 -mattr=+d,+zfh \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 
 # CHECK-ASM-AND-OBJ: fli.s ft1, -1.0
diff --git a/llvm/test/MC/RISCV/zfa-zfhmin-zvfh-valid.s b/llvm/test/MC/RISCV/zfa-zfhmin-zvfh-valid.s
index 6b5dc92..a7a16d5 100644
--- a/llvm/test/MC/RISCV/zfa-zfhmin-zvfh-valid.s
+++ b/llvm/test/MC/RISCV/zfa-zfhmin-zvfh-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+zfhmin,+zvfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+zfhmin,+zvfh -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfa,+zfhmin,+zvfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfa,+zfhmin,+zvfh -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfa,+zfhmin,+zvfh < %s \
 # RUN:     | llvm-objdump --mattr=+zfa,+zfhmin,+zvfh -M no-aliases -d -r - \
@@ -9,9 +9,9 @@
 # RUN:     | llvm-objdump --mattr=+zfa,+zfhmin,+zvfh -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
-# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv32 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
-# RUN: not llvm-mc -triple riscv64 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv64 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 
 # This test makes sure fli.h is supported with Zvfh.
diff --git a/llvm/test/MC/RISCV/zicfilp-invalid.s b/llvm/test/MC/RISCV/zicfilp-invalid.s
index 5b22c0a..bff989f 100644
--- a/llvm/test/MC/RISCV/zicfilp-invalid.s
+++ b/llvm/test/MC/RISCV/zicfilp-invalid.s
@@ -1,6 +1,6 @@
-# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-zicfilp -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-zicfilp -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
-# RUN: not llvm-mc -triple riscv64 -mattr=+experimental-zicfilp -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv64 -mattr=+experimental-zicfilp -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 
 # CHECK-NO-EXT: immediate must be an integer in the range [0, 1048575]
diff --git a/llvm/test/MC/RISCV/zicfilp-valid.s b/llvm/test/MC/RISCV/zicfilp-valid.s
index 308e9b6..f61cad8 100644
--- a/llvm/test/MC/RISCV/zicfilp-valid.s
+++ b/llvm/test/MC/RISCV/zicfilp-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfilp -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfilp -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfilp -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfilp -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zicfilp < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-zicfilp --no-print-imm-hex -d -r - \
@@ -9,9 +9,9 @@
 # RUN:     | llvm-objdump --mattr=+experimental-zicfilp --no-print-imm-hex -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
-# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv32 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
-# RUN: not llvm-mc -triple riscv64 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv64 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 
 # CHECK-ASM-AND-OBJ: lpad 22
diff --git a/llvm/test/MC/RISCV/zicfiss-valid.s b/llvm/test/MC/RISCV/zicfiss-valid.s
index fd69d37..5b2ab8d 100644
--- a/llvm/test/MC/RISCV/zicfiss-valid.s
+++ b/llvm/test/MC/RISCV/zicfiss-valid.s
@@ -1,17 +1,17 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+experimental-zicfiss -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+experimental-zicfiss -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+experimental-zicfiss < %s \
 # RUN:     | llvm-objdump --mattr=+a,+experimental-zicfiss -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -defsym=RV64=1 -mattr=+a,+experimental-zicfiss -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -defsym=RV64=1 -mattr=+a,+experimental-zicfiss -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM-RV64,CHECK-ASM,CHECK-ASM-AND-OBJ-RV64,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -defsym=RV64=1 -mattr=+a,+experimental-zicfiss < %s \
 # RUN:     | llvm-objdump --mattr=+a,+experimental-zicfiss -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ-RV64,CHECK-ASM-AND-OBJ %s
 #
-# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv32 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
-# RUN: not llvm-mc -triple riscv64 -defsym=RV64=1 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv64 -defsym=RV64=1 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT-RV64 %s
 
 # CHECK-ASM-AND-OBJ: sspopchk ra
diff --git a/llvm/test/TableGen/ContextlessPredicates.td b/llvm/test/TableGen/GlobalISelEmitter/ContextlessPredicates.td
index eead965..fa3484e 100644
--- a/llvm/test/TableGen/ContextlessPredicates.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/ContextlessPredicates.td
@@ -1,6 +1,6 @@
-// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common -optimize-match-table=false %s -o %T/context-non-optimized.cpp
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common -optimize-match-table=false %s -o %T/context-non-optimized.cpp
 // RUN: FileCheck %s --check-prefixes=CHECK_NOPT -input-file=%T/context-non-optimized.cpp
-// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common -optimize-match-table=true  %s -o %T/context-optimized.cpp
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common -optimize-match-table=true  %s -o %T/context-optimized.cpp
 // RUN: FileCheck %s --check-prefixes=CHECK_OPT -input-file=%T/context-optimized.cpp
 
 
diff --git a/llvm/test/TableGen/GlobalISelEmitterCustomPredicate.td b/llvm/test/TableGen/GlobalISelEmitter/CustomPredicate.td
index 3ceadf3..56eaa4b 100644
--- a/llvm/test/TableGen/GlobalISelEmitterCustomPredicate.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/CustomPredicate.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common -o - | FileCheck %s
+// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common | FileCheck %s
 
 // Verify that all MI predicates are enumerated.
 //
diff --git a/llvm/test/TableGen/DefaultOpsGlobalISel.td b/llvm/test/TableGen/GlobalISelEmitter/DefaultOpsGlobalISel.td
index 8f4176a..f88045c 100644
--- a/llvm/test/TableGen/DefaultOpsGlobalISel.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/DefaultOpsGlobalISel.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common -o - | FileCheck %s
+// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common -o - | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitterFlags.td b/llvm/test/TableGen/GlobalISelEmitter/Flags.td
index fa8f2a7..0878955 100644
--- a/llvm/test/TableGen/GlobalISelEmitterFlags.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/Flags.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common -o - | FileCheck %s
+// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter.td b/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td
index ffefaba..7c81814 100644
--- a/llvm/test/TableGen/GlobalISelEmitter.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td
@@ -1,6 +1,6 @@
-// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common -optimize-match-table=false %s -o %T/non-optimized.cpp
-// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common -optimize-match-table=true  %s -o %T/optimized.cpp
-// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common %s -o %T/default.cpp
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common -optimize-match-table=false %s -o %T/non-optimized.cpp
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common -optimize-match-table=true  %s -o %T/optimized.cpp
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common %s -o %T/default.cpp
 
 // RUN: FileCheck %s --check-prefixes=CHECK,R19C,R19N -input-file=%T/non-optimized.cpp
 // RUN: FileCheck %s --check-prefixes=CHECK,R19C,R19O -input-file=%T/optimized.cpp
diff --git a/llvm/test/TableGen/GlobalISelEmitterHwModes.td b/llvm/test/TableGen/GlobalISelEmitter/HwModes.td
index 9d235f5..3588ba3 100644
--- a/llvm/test/TableGen/GlobalISelEmitterHwModes.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/HwModes.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common -optimize-match-table=false %s -o %T/hwmode-non-optimized.cpp
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common -optimize-match-table=false %s -o %T/hwmode-non-optimized.cpp
 // RUN: FileCheck %s --check-prefixes=CHECK -input-file=%T/hwmode-non-optimized.cpp
 
 include "llvm/Target/Target.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizer.td b/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizer.td
index 3db31be..c430725 100644
--- a/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizer.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizer.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=true -I %p/../../include -I %p/Common -o - | FileCheck %s
+// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=true -I %p/../../../include -I %p/../Common | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizerSameOperand-invalid.td b/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizerSameOperand-invalid.td
index d93805b..18ae767 100644
--- a/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizerSameOperand-invalid.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizerSameOperand-invalid.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=true -I %p/../../include -I %p/Common -o - | FileCheck %s
+// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=true -I %p/../../../include -I %p/../Common | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizerSameOperand.td b/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizerSameOperand.td
index 1ac3399..c6ca9b7 100644
--- a/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizerSameOperand.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizerSameOperand.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=true -I %p/../../include -I %p/Common -o - | FileCheck %s
+// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=true -I %p/../../../include -I %p/../Common | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitterOverloadedPtr.td b/llvm/test/TableGen/GlobalISelEmitter/OverloadedPtr.td
index 422edbb..31accba 100644
--- a/llvm/test/TableGen/GlobalISelEmitterOverloadedPtr.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/OverloadedPtr.td
@@ -1,13 +1,35 @@
-// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common %s -o - | FileCheck %s
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common %s | FileCheck %s
 
 // Boilerplate code.
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
 
+def GPR : RegisterClass<"MyTarget", [i32, i64], 32, (add R0)>;
+
 let TargetPrefix = "mytarget" in {
     def int_mytarget_anyptr : Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty]>;
 }
 
+// Check that iPTR in the destination DAG doesn't prevent the pattern from being imported.
+
+// CHECK: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0,
+// CHECK-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic,
+// CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// CHECK-NEXT: // MIs[0] src1
+// CHECK-NEXT: GIM_CheckPointerToAny, /*MI*/0, /*Op*/1, /*SizeInBits*/0,
+// CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPRRegClassID),
+// CHECK-NEXT: // (ld:{ *:[i32] } GPR:{ *:[iPTR] }:$src1)<<P:Predicate_unindexedload>><<P:Predicate_load>>  =>  (ANYLOAD:{ *:[i32] } GPR:{ *:[iPTR] }:$src1)
+// CHECK-NEXT: GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/0, /*Opcode*/GIMT_Encode2(MyTarget::ANYLOAD),
+// CHECK-NEXT: GIR_RootConstrainSelectedInstOperands,
+// CHECK-NEXT: // GIR_Coverage, 0,
+// CHECK-NEXT: GIR_Done,
+
+let hasSideEffects = 1 in {
+  def ANYLOAD : I<(outs GPR32:$dst), (ins GPR:$src1),
+                  [(set GPR32:$dst, (load GPR:$src1))]>;
+}
+
 // Ensure that llvm_anyptr_ty on an intrinsic results in a
 // GIM_CheckPointerToAny rather than a GIM_CheckType.
 //
@@ -20,10 +42,6 @@ let TargetPrefix = "mytarget" in {
 // CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GIMT_Encode2(GICXXPred_MI_Predicate_frag_anyptr),
 // CHECK-NEXT: // (intrinsic_w_chain:{ *:[i32] } {{[0-9]+}}:{ *:[iPTR] }, GPR32:{ *:[i32] }:$src)<<P:Predicate_frag_anyptr>>  =>  (ANYLOAD:{ *:[i32] } GPR32:{ *:[i32] }:$src)
 // CHECK-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::ANYLOAD),
-let hasSideEffects = 1 in {
-  def ANYLOAD : I<(outs GPR32:$dst), (ins GPR32:$src1),
-            [(set GPR32:$dst, (load GPR32:$src1))]>;
-}
 
 def frag_anyptr : PatFrag<(ops node:$src),
                    (int_mytarget_anyptr node:$src),
diff --git a/llvm/test/TableGen/GlobalISelEmitter-PR39045.td b/llvm/test/TableGen/GlobalISelEmitter/PR39045.td
index 5407222..595fa92 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-PR39045.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/PR39045.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common  %s -o %t
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common  %s -o %t
 // RUN: FileCheck %s < %t
 
 // Both predicates should be tested
diff --git a/llvm/test/TableGen/GlobalISelEmitterRegSequence.td b/llvm/test/TableGen/GlobalISelEmitter/RegSequence.td
index 69f82ea..97790fb 100644
--- a/llvm/test/TableGen/GlobalISelEmitterRegSequence.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/RegSequence.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common -o - | FileCheck %s
+// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-SDNodeXForm-timm.td b/llvm/test/TableGen/GlobalISelEmitter/SDNodeXForm-timm.td
index 8d6dedf..fab395d 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-SDNodeXForm-timm.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/SDNodeXForm-timm.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - | FileCheck -check-prefix=GISEL %s
+// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefix=GISEL %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitterSkippedPatterns.td b/llvm/test/TableGen/GlobalISelEmitter/SkippedPatterns.td
index fc8abc6..9d9d3f9 100644
--- a/llvm/test/TableGen/GlobalISelEmitterSkippedPatterns.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/SkippedPatterns.td
@@ -1,5 +1,5 @@
-// RUN: llvm-tblgen -warn-on-skipped-patterns -gen-global-isel -I %p/../../include %s -I %p/Common -o /dev/null 2>&1 | FileCheck %s
-// RUN: llvm-tblgen -warn-on-skipped-patterns -gen-global-isel -I %p/../../include %s -I %p/Common -o /dev/null -DIGNORE 2>&1 | FileCheck --allow-empty --check-prefix=IGNORED %s
+// RUN: llvm-tblgen -warn-on-skipped-patterns -gen-global-isel -I %p/../../../include %s -I %p/../Common -o /dev/null 2>&1 | FileCheck %s
+// RUN: llvm-tblgen -warn-on-skipped-patterns -gen-global-isel -I %p/../../../include %s -I %p/../Common -o /dev/null -DIGNORE 2>&1 | FileCheck --allow-empty --check-prefix=IGNORED %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitterSubreg.td b/llvm/test/TableGen/GlobalISelEmitter/Subreg.td
index 08e690f..5203c2b 100644
--- a/llvm/test/TableGen/GlobalISelEmitterSubreg.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/Subreg.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s -gen-global-isel -warn-on-skipped-patterns -I %p/../../include -I %p/Common -o - 2> %t.skipped | FileCheck %s
+// RUN: llvm-tblgen %s -gen-global-isel -warn-on-skipped-patterns -I %p/../../../include -I %p/../Common 2> %t.skipped | FileCheck %s
 // RUN: cat %t.skipped | FileCheck %s --check-prefix=SKIPPED
 
 include "llvm/Target/Target.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitterVariadic.td b/llvm/test/TableGen/GlobalISelEmitter/Variadic.td
index 992e1a4..b3c8052 100644
--- a/llvm/test/TableGen/GlobalISelEmitterVariadic.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/Variadic.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common %s -o - | FileCheck %s
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common %s | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-atomic_store.td b/llvm/test/TableGen/GlobalISelEmitter/atomic-store.td
index da2dfe8..53b8670 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-atomic_store.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/atomic-store.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - < %s | FileCheck -check-prefix=GISEL %s
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefix=GISEL %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter/dead-def.td b/llvm/test/TableGen/GlobalISelEmitter/dead-def.td
new file mode 100644
index 0000000..a8597f1
--- /dev/null
+++ b/llvm/test/TableGen/GlobalISelEmitter/dead-def.td
@@ -0,0 +1,27 @@
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false \
+// RUN:   -I %p/../../../include -I %p/../Common %s | FileCheck %s
+
+include "llvm/Target/Target.td"
+include "GlobalISelEmitterCommon.td"
+
+// Check that $same_name from I2 isn't copied to the root instruction.
+
+def I1 : I<(outs GPR32:$same_name), (ins GPR32:$rs), []>;
+def I2 : I<(outs GPR32:$other_name, GPR32:$same_name), (ins GPR32:$rs), []>;
+
+def : Pat<(abs i32:$x), (I1 (I2 $x))>;
+
+// CHECK-LABEL: // (abs:{ *:[i32] } i32:{ *:[i32] }:$x)  =>  (I1:{ *:[i32] } (I2:{ *:[i32] }:{ *:[i32] } ?:{ *:[i32] }:$x))
+// CHECK-NEXT: GIR_MakeTempReg, /*TempRegID*/1, /*TypeID*/GILLT_s32,
+// CHECK-NEXT: GIR_MakeTempReg, /*TempRegID*/0, /*TypeID*/GILLT_s32,
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(MyTarget::I2),
+// CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/0, /*TempRegFlags*/GIMT_Encode2(RegState::Define),
+// CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/1, /*TempRegFlags*/GIMT_Encode2(RegState::Define|RegState::Dead),
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/0, /*OpIdx*/1, // x
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/1,
+// CHECK-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::I1),
+// CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[same_name]
+// CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0,
+// CHECK-NEXT: GIR_RootConstrainSelectedInstOperands,
+// CHECK-NEXT: // GIR_Coverage, 0,
+// CHECK-NEXT: GIR_EraseRootFromParent_Done,
diff --git a/llvm/test/TableGen/GlobalISelEmitter-frameindex.td b/llvm/test/TableGen/GlobalISelEmitter/frameindex.td
index 715e53d..2778452 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-frameindex.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/frameindex.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common %s | FileCheck %s
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common %s | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter/gisel-physreg-input.td b/llvm/test/TableGen/GlobalISelEmitter/gisel-physreg-input.td
new file mode 100644
index 0000000..1f1b557
--- /dev/null
+++ b/llvm/test/TableGen/GlobalISelEmitter/gisel-physreg-input.td
@@ -0,0 +1,167 @@
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../../include %s | FileCheck -check-prefix=GISEL %s
+
+include "llvm/Target/Target.td"
+
+def TestTargetInstrInfo : InstrInfo;
+
+def TestTarget : Target {
+  let InstructionSet = TestTargetInstrInfo;
+}
+
+def R0 : Register<"r0"> { let Namespace = "MyTarget"; }
+def SPECIAL : Register<"special"> { let Namespace = "MyTarget"; }
+def GPR32 : RegisterClass<"MyTarget", [i32], 32, (add R0)>;
+def Special32 : RegisterClass<"MyTarget", [i32], 32, (add SPECIAL)>;
+
+
+class I<dag OOps, dag IOps, list<dag> Pat>
+  : Instruction {
+  let Namespace = "MyTarget";
+  let OutOperandList = OOps;
+  let InOperandList = IOps;
+  let Pattern = Pat;
+}
+
+// Try a nested physical register
+
+// GISEL: GIM_Try,
+// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2,
+// GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_STORE),
+// GISEL-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic,
+// GISEL-NEXT: // MIs[0] src0
+// GISEL-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// GISEL-NEXT: // MIs[0] Operand 1
+// GISEL-NEXT: GIM_CheckPointerToAny, /*MI*/0, /*Op*/1, /*SizeInBits*/32,
+// GISEL-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1]
+// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/1, /*Expected*/3,
+// GISEL-NEXT: GIM_CheckOpcode, /*MI*/1, GIMT_Encode2(TargetOpcode::G_MUL),
+// GISEL-NEXT: // MIs[1] Operand 0
+// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32,
+// GISEL-NEXT: // MIs[1] src1
+// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// GISEL-NEXT: // MIs[1] Operand 2
+// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/2, /*RC*/GIMT_Encode2(MyTarget::Special32RegClassID),
+// GISEL-NEXT: GIM_CheckIsSafeToFold, /*NumInsns*/1,
+// GISEL-NEXT: // (st GPR32:{ *:[i32] }:$src0, (mul:{ *:[i32] } GPR32:{ *:[i32] }:$src1, SPECIAL:{ *:[i32] }))  =>  (MULM_PHYS GPR32:{ *:[i32] }:$src0, GPR32:{ *:[i32] }:$src1)
+// GISEL-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY),
+// GISEL-NEXT: GIR_AddRegister, /*InsnID*/1, GIMT_Encode2(MyTarget::SPECIAL), /*AddRegisterRegFlags*/GIMT_Encode2(RegState::Define),
+// GISEL-NEXT: GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/1, /*OpIdx*/2, // SPECIAL
+// GISEL-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::MULM_PHYS),
+// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // src0
+// GISEL-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // src1
+// GISEL-NEXT: GIR_MergeMemOperands, /*InsnID*/0, /*NumInsns*/2, /*MergeInsnID's*/0, 1,
+// GISEL-NEXT: GIR_RootConstrainSelectedInstOperands,
+// GISEL-NEXT: // GIR_Coverage, 0,
+// GISEL-NEXT: GIR_EraseRootFromParent_Done,
+def MULM_PHYS : I<(outs), (ins GPR32:$src0, GPR32:$src1),
+    [(st GPR32:$src0, (mul GPR32:$src1, SPECIAL))]> {
+  let Uses = [SPECIAL];
+}
+
+// Try nested physical registers and check on duplicated copies
+
+// GISEL: GIM_Try,
+// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2,
+// GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_STORE),
+// GISEL-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic,
+// GISEL-NEXT: // MIs[0] src0
+// GISEL-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// GISEL-NEXT: // MIs[0] Operand 1
+// GISEL-NEXT: GIM_CheckPointerToAny, /*MI*/0, /*Op*/1, /*SizeInBits*/32,
+// GISEL-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1]
+// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/1, /*Expected*/3,
+// GISEL-NEXT: GIM_CheckOpcode, /*MI*/1, GIMT_Encode2(TargetOpcode::G_MUL),
+// GISEL-NEXT: // MIs[1] Operand 0
+// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32,
+// GISEL-NEXT: // MIs[1] Operand 1
+// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// GISEL-NEXT: // MIs[1] Operand 2
+// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/2, /*RC*/GIMT_Encode2(MyTarget::Special32RegClassID),
+// GISEL-NEXT: GIM_CheckIsSafeToFold, /*NumInsns*/1,
+// GISEL-NEXT: // (st GPR32:{ *:[i32] }:$src0, (mul:{ *:[i32] } R0:{ *:[i32] }, SPECIAL:{ *:[i32] }))  =>  (MULMR0_PHYS GPR32:{ *:[i32] }:$src0)
+// GISEL-NEXT: GIR_BuildMI, /*InsnID*/2, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY),
+// GISEL-NEXT: GIR_AddRegister, /*InsnID*/2, GIMT_Encode2(MyTarget::SPECIAL), /*AddRegisterRegFlags*/GIMT_Encode2(RegState::Define),
+// GISEL-NEXT: GIR_Copy, /*NewInsnID*/2, /*OldInsnID*/1, /*OpIdx*/2, // SPECIAL
+// GISEL-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY),
+// GISEL-NEXT: GIR_AddRegister, /*InsnID*/1, GIMT_Encode2(MyTarget::R0), /*AddRegisterRegFlags*/GIMT_Encode2(RegState::Define),
+// GISEL-NEXT: GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/1, /*OpIdx*/1, // R0
+// GISEL-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::MULMR0_PHYS),
+// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // src0
+// GISEL-NEXT: GIR_MergeMemOperands, /*InsnID*/0, /*NumInsns*/2, /*MergeInsnID's*/0, 1,
+// GISEL-NEXT: GIR_RootConstrainSelectedInstOperands,
+// GISEL-NEXT: // GIR_Coverage, 1,
+// GISEL-NEXT: GIR_EraseRootFromParent_Done,
+def MULMR0_PHYS : I<(outs), (ins GPR32:$src0),
+    [(st GPR32:$src0, (mul R0, SPECIAL))]> {
+  let Uses = [R0, SPECIAL];
+}
+
+// Try a normal physical register use.
+
+// GISEL: GIM_Try,
+// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_ADD),
+// GISEL-NEXT: // MIs[0] DstI[dst]
+// GISEL-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// GISEL-NEXT: // MIs[0] src0
+// GISEL-NEXT: GIM_RootCheckType, /*Op*/1, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// GISEL-NEXT: // MIs[0] Operand 2
+// GISEL-NEXT: GIM_RootCheckType, /*Op*/2, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/2, /*RC*/GIMT_Encode2(MyTarget::Special32RegClassID),
+// GISEL-NEXT: // (add:{ *:[i32] } GPR32:{ *:[i32] }:$src0, SPECIAL:{ *:[i32] })  =>  (ADD_PHYS:{ *:[i32] } GPR32:{ *:[i32] }:$src0)
+// GISEL-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY),
+// GISEL-NEXT: GIR_AddRegister, /*InsnID*/1, GIMT_Encode2(MyTarget::SPECIAL), /*AddRegisterRegFlags*/GIMT_Encode2(RegState::Define),
+// GISEL-NEXT: GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/0, /*OpIdx*/2, // SPECIAL
+// GISEL-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::ADD_PHYS),
+// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[dst]
+// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // src0
+// GISEL-NEXT: GIR_RootConstrainSelectedInstOperands,
+// GISEL-NEXT: // GIR_Coverage, 2,
+// GISEL-NEXT: GIR_EraseRootFromParent_Done,
+def ADD_PHYS : I<(outs GPR32:$dst), (ins GPR32:$src0),
+    [(set GPR32:$dst, (add GPR32:$src0, SPECIAL))]> {
+  let Uses = [SPECIAL];
+}
+
+// Try using the name of the physreg in another operand.
+
+// GISEL: GIM_Try,
+// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_MUL),
+// GISEL-NEXT: // MIs[0] DstI[dst]
+// GISEL-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// GISEL-NEXT: // MIs[0] SPECIAL
+// GISEL-NEXT: GIM_RootCheckType, /*Op*/1, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// GISEL-NEXT: // MIs[0] Operand 2
+// GISEL-NEXT: GIM_RootCheckType, /*Op*/2, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/2, /*RC*/GIMT_Encode2(MyTarget::Special32RegClassID),
+// GISEL-NEXT: // (mul:{ *:[i32] } GPR32:{ *:[i32] }:$SPECIAL, SPECIAL:{ *:[i32] })  =>  (MUL_PHYS:{ *:[i32] } GPR32:{ *:[i32] }:$SPECIAL)
+// GISEL-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY),
+// GISEL-NEXT: GIR_AddRegister, /*InsnID*/1, GIMT_Encode2(MyTarget::SPECIAL), /*AddRegisterRegFlags*/GIMT_Encode2(RegState::Define),
+// GISEL-NEXT: GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/0, /*OpIdx*/2, // SPECIAL
+// GISEL-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::MUL_PHYS),
+// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[dst]
+// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // SPECIAL
+// GISEL-NEXT: GIR_RootConstrainSelectedInstOperands,
+// GISEL-NEXT: // GIR_Coverage, 3,
+// GISEL-NEXT: GIR_EraseRootFromParent_Done,
+def MUL_PHYS : I<(outs GPR32:$dst), (ins GPR32:$SPECIAL),
+    [(set GPR32:$dst, (mul GPR32:$SPECIAL, SPECIAL))]> {
+  let Uses = [SPECIAL];
+}
+
+// Try giving the physical operand a name
+// def ADD_PHYS : I<(outs GPR32:$dst), (ins GPR32:$src0),
+//     [(set GPR32:$dst, (add GPR32:$src0, SPECIAL:$special))]> {
+//   let Uses = [SPECIAL];
+// }
diff --git a/llvm/test/TableGen/GlobalISelEmitter-immAllZeroOne.td b/llvm/test/TableGen/GlobalISelEmitter/immAllZeroOne.td
index 0125aa5..68278f4 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-immAllZeroOne.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/immAllZeroOne.td
@@ -1,5 +1,5 @@
-// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - | FileCheck -check-prefixes=GISEL-NOOPT %s
-// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=true -I %p/../../include -I %p/Common %s -o - | FileCheck -check-prefixes=GISEL-OPT %s
+// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefixes=GISEL-NOOPT %s
+// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=true -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefixes=GISEL-OPT %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-immarg-literal-pattern.td b/llvm/test/TableGen/GlobalISelEmitter/immarg-literal-pattern.td
index 6b4012e..ff05ac1 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-immarg-literal-pattern.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/immarg-literal-pattern.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - | FileCheck -check-prefix=GISEL %s
+// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefix=GISEL %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/immarg-predicated.td b/llvm/test/TableGen/GlobalISelEmitter/immarg-predicated.td
index dcacb2f..ab412fa 100644
--- a/llvm/test/TableGen/immarg-predicated.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/immarg-predicated.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/Common -I %p/../../include %s -o - < %s | FileCheck -check-prefix=GISEL %s
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../Common -I %p/../../../include %s | FileCheck -check-prefix=GISEL %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/immarg.td b/llvm/test/TableGen/GlobalISelEmitter/immarg.td
index e5fd06c..eae0409 100644
--- a/llvm/test/TableGen/immarg.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/immarg.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/Common -I %p/../../include %s -o - < %s | FileCheck -check-prefix=GISEL %s
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../Common -I %p/../../../include %s | FileCheck -check-prefix=GISEL %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-implicit-defs.td b/llvm/test/TableGen/GlobalISelEmitter/implicit-defs.td
index 79af1a3..06e5e39 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-implicit-defs.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/implicit-defs.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -I %p/../../include -I %p/Common %s -o /dev/null 2>&1 < %s | FileCheck %s --implicit-check-not="Skipped pattern"
+// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -I %p/../../../include -I %p/../Common %s -o /dev/null 2>&1 | FileCheck %s --implicit-check-not="Skipped pattern"
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-input-discard.td b/llvm/test/TableGen/GlobalISelEmitter/input-discard.td
index 202ff4a..65ebfa2 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-input-discard.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/input-discard.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -I %p/../../include -I %p/Common %s -o - < %s | FileCheck -check-prefix=GISEL %s
+// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefix=GISEL %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-multiple-output-discard.td b/llvm/test/TableGen/GlobalISelEmitter/multiple-output-discard.td
index 2d968bebb..a180431 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-multiple-output-discard.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/multiple-output-discard.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -warn-on-skipped-patterns -I %p/../../include -I %p/Common %s -o - < %s | FileCheck %s
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -warn-on-skipped-patterns -I %p/../../../include -I %p/../Common %s | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-multiple-output.td b/llvm/test/TableGen/GlobalISelEmitter/multiple-output.td
index dea3b54..baf7675 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-multiple-output.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/multiple-output.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -warn-on-skipped-patterns -I %p/../../include -I %p/Common %s -o - < %s | FileCheck %s
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -warn-on-skipped-patterns -I %p/../../../include -I %p/../Common %s | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-nested-subregs.td b/llvm/test/TableGen/GlobalISelEmitter/nested-subregs.td
index 79e55ef..8688e4f 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-nested-subregs.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/nested-subregs.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common -o - | FileCheck %s
+// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-notype-output-pattern.td b/llvm/test/TableGen/GlobalISelEmitter/notype-output-pattern.td
index 622d7fa..80b1256 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-notype-output-pattern.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/notype-output-pattern.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common %s | FileCheck %s
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common %s | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-optional-def.td b/llvm/test/TableGen/GlobalISelEmitter/optional-def.td
index def4a04..7792a97e 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-optional-def.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/optional-def.td
@@ -1,5 +1,5 @@
 // RUN: llvm-tblgen -gen-global-isel  -warn-on-skipped-patterns \
-// RUN:   -I %p/../../include -I %p/Common %s 2> %t | FileCheck %s
+// RUN:   -I %p/../../../include -I %p/../Common %s 2> %t | FileCheck %s
 // RUN: FileCheck -DFILE=%s -check-prefix=ERR %s < %t
 
 include "llvm/Target/Target.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-output-discard.td b/llvm/test/TableGen/GlobalISelEmitter/output-discard.td
index 7a0242d..c249dcb 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-output-discard.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/output-discard.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -I %p/../../include -I %p/Common %s -o - < %s | FileCheck -check-prefix=GISEL %s
+// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefix=GISEL %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-setcc.td b/llvm/test/TableGen/GlobalISelEmitter/setcc.td
index 38add76..02622d0 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-setcc.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/setcc.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - 2> %t < %s | FileCheck -check-prefix=GISEL %s
+// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../../include -I %p/../Common %s 2> %t | FileCheck -check-prefix=GISEL %s
 // RUN: FileCheck -DFILE=%s -check-prefix=ERR %s < %t
 
 include "llvm/Target/Target.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter/undef-tied-input.td b/llvm/test/TableGen/GlobalISelEmitter/undef-tied-input.td
new file mode 100644
index 0000000..323aea9
--- /dev/null
+++ b/llvm/test/TableGen/GlobalISelEmitter/undef-tied-input.td
@@ -0,0 +1,35 @@
+// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns \
+// RUN:   -I %p/../../../include -I %p/../Common %s 2> %t | FileCheck %s
+// RUN: FileCheck -check-prefix=ERR %s < %t
+
+include "llvm/Target/Target.td"
+include "GlobalISelEmitterCommon.td"
+
+def undef_tied_1 : OperandWithDefaultOps<untyped, (ops (i32 undef_tied_input))> {
+  let MIOperandInfo = (ops GPR32:$inactive);
+}
+
+def undef_tied_2 : OperandWithDefaultOps<i32, (ops (untyped undef_tied_input))> {
+  let MIOperandInfo = (ops GPR32:$inactive);
+}
+
+let Constraints = "$opt.inactive = $rd" in
+def I1 : I<(outs GPR32:$rd), (ins GPR32:$rs, undef_tied_1:$opt),
+           [(set GPR32:$rd, (abs i32:$rs))]>;
+
+// ERR: [[#@LINE+2]]:5: warning: Skipped pattern: unsupported type
+let Constraints = "$opt.inactive = $rd" in
+def I2 : I<(outs GPR32:$rd), (ins GPR32:$rs, undef_tied_2:$opt),
+           [(set GPR32:$rd, (abs i32:$rs))]>;
+
+// CHECK-LABEL: // (abs:{ *:[i32] } i32:{ *:[i32] }:$rs)  =>  (I1:{ *:[i32] } i32:{ *:[i32] }:$rs)
+// CHECK-NEXT: GIR_MakeTempReg, /*TempRegID*/0, /*TypeID*/GILLT_s32,
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(TargetOpcode::IMPLICIT_DEF),
+// CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/0, /*TempRegFlags*/GIMT_Encode2(RegState::Define),
+// CHECK-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::I1),
+// CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[rd]
+// CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // rs
+// CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0,
+// CHECK-NEXT: GIR_RootConstrainSelectedInstOperands,
+// CHECK-NEXT: // GIR_Coverage, 0,
+// CHECK-NEXT: GIR_EraseRootFromParent_Done,
diff --git a/llvm/test/TableGen/GlobalISelEmitter-zero-instr.td b/llvm/test/TableGen/GlobalISelEmitter/zero-instr.td
index c8a8cab..f9463ba 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-zero-instr.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/zero-instr.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common %s -o /dev/null --warn-on-skipped-patterns 2>&1 < %s 2>&1 | FileCheck %s
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common %s -o /dev/null --warn-on-skipped-patterns 2>&1 | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-zero-reg.td b/llvm/test/TableGen/GlobalISelEmitter/zero-reg.td
index ddf0224..87e5432 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-zero-reg.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/zero-reg.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - < %s | FileCheck %s
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common %s | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/gisel-physreg-input.td b/llvm/test/TableGen/gisel-physreg-input.td
deleted file mode 100644
index f19872a..0000000
--- a/llvm/test/TableGen/gisel-physreg-input.td
+++ /dev/null
@@ -1,87 +0,0 @@
-// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../include %s -o - < %s | FileCheck -check-prefix=GISEL %s
-
-include "llvm/Target/Target.td"
-
-def TestTargetInstrInfo : InstrInfo;
-
-def TestTarget : Target {
-  let InstructionSet = TestTargetInstrInfo;
-}
-
-def R0 : Register<"r0"> { let Namespace = "MyTarget"; }
-def SPECIAL : Register<"special"> { let Namespace = "MyTarget"; }
-def GPR32 : RegisterClass<"MyTarget", [i32], 32, (add R0)>;
-def Special32 : RegisterClass<"MyTarget", [i32], 32, (add SPECIAL)>;
-
-
-class I<dag OOps, dag IOps, list<dag> Pat>
-  : Instruction {
-  let Namespace = "MyTarget";
-  let OutOperandList = OOps;
-  let InOperandList = IOps;
-  let Pattern = Pat;
-}
-
-// Try a normal physical register use.
-
-// GISEL: GIM_Try,
-// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
-// GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_ADD),
-// GISEL-NEXT: // MIs[0] DstI[dst]
-// GISEL-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32,
-// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
-// GISEL-NEXT: // MIs[0] src0
-// GISEL-NEXT: GIM_RootCheckType, /*Op*/1, /*Type*/GILLT_s32,
-// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
-// GISEL-NEXT: // MIs[0] Operand 2
-// GISEL-NEXT: GIM_RootCheckType, /*Op*/2, /*Type*/GILLT_s32,
-// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/2, /*RC*/GIMT_Encode2(MyTarget::Special32RegClassID),
-// GISEL-NEXT: // (add:{ *:[i32] } GPR32:{ *:[i32] }:$src0, SPECIAL:{ *:[i32] })  =>  (ADD_PHYS:{ *:[i32] } GPR32:{ *:[i32] }:$src0)
-// GISEL-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY),
-// GISEL-NEXT: GIR_AddRegister, /*InsnID*/1, GIMT_Encode2(MyTarget::SPECIAL), /*AddRegisterRegFlags*/GIMT_Encode2(RegState::Define),
-// GISEL-NEXT: GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/0, /*OpIdx*/2, // SPECIAL
-// GISEL-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::ADD_PHYS),
-// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[dst]
-// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // src0
-// GISEL-NEXT: GIR_RootConstrainSelectedInstOperands,
-// GISEL-NEXT: // GIR_Coverage, 0,
-// GISEL-NEXT: GIR_EraseRootFromParent_Done,
-def ADD_PHYS : I<(outs GPR32:$dst), (ins GPR32:$src0),
-    [(set GPR32:$dst, (add GPR32:$src0, SPECIAL))]> {
-  let Uses = [SPECIAL];
-}
-
-// Try using the name of the physreg in another operand.
-
-// GISEL: GIM_Try,
-// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
-// GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_MUL),
-// GISEL-NEXT: // MIs[0] DstI[dst]
-// GISEL-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32,
-// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
-// GISEL-NEXT: // MIs[0] SPECIAL
-// GISEL-NEXT: GIM_RootCheckType, /*Op*/1, /*Type*/GILLT_s32,
-// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
-// GISEL-NEXT: // MIs[0] Operand 2
-// GISEL-NEXT: GIM_RootCheckType, /*Op*/2, /*Type*/GILLT_s32,
-// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/2, /*RC*/GIMT_Encode2(MyTarget::Special32RegClassID),
-// GISEL-NEXT: // (mul:{ *:[i32] } GPR32:{ *:[i32] }:$SPECIAL, SPECIAL:{ *:[i32] })  =>  (MUL_PHYS:{ *:[i32] } GPR32:{ *:[i32] }:$SPECIAL)
-// GISEL-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY),
-// GISEL-NEXT: GIR_AddRegister, /*InsnID*/1, GIMT_Encode2(MyTarget::SPECIAL), /*AddRegisterRegFlags*/GIMT_Encode2(RegState::Define),
-// GISEL-NEXT: GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/0, /*OpIdx*/2, // SPECIAL
-// GISEL-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::MUL_PHYS),
-// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[dst]
-// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // SPECIAL
-// GISEL-NEXT: GIR_RootConstrainSelectedInstOperands,
-// GISEL-NEXT: // GIR_Coverage, 1,
-// GISEL-NEXT: GIR_EraseRootFromParent_Done,
-def MUL_PHYS : I<(outs GPR32:$dst), (ins GPR32:$SPECIAL),
-    [(set GPR32:$dst, (mul GPR32:$SPECIAL, SPECIAL))]> {
-  let Uses = [SPECIAL];
-}
-
-// Try giving the physical operand a name
-// def ADD_PHYS : I<(outs GPR32:$dst), (ins GPR32:$src0),
-//     [(set GPR32:$dst, (add GPR32:$src0, SPECIAL:$special))]> {
-//   let Uses = [SPECIAL];
-// }
diff --git a/llvm/test/TableGen/template-args.td b/llvm/test/TableGen/template-args.td
index f3eb02d..1644b0a1 100644
--- a/llvm/test/TableGen/template-args.td
+++ b/llvm/test/TableGen/template-args.td
@@ -9,6 +9,7 @@
 // RUN: not llvm-tblgen -DERROR8 %s 2>&1 | FileCheck --check-prefix=ERROR8 %s
 // RUN: not llvm-tblgen -DERROR9 %s 2>&1 | FileCheck --check-prefix=ERROR9 %s
 // RUN: not llvm-tblgen -DERROR10 %s 2>&1 | FileCheck --check-prefix=ERROR10 %s
+// RUN: not llvm-tblgen -DERROR11 %s 2>&1 | FileCheck --check-prefix=ERROR11 %s
 
 // This file tests that all required arguments are specified and template
 // arguments are type-checked and cast if necessary.
@@ -158,13 +159,13 @@ defm MissingComma : TwoArgs<2 "two">;
 #ifdef ERROR8
 def error8: Class1;
 // ERROR8: value not specified for template argument 'Class1:nm'
-// ERROR8: 18:21: note: declared in 'Class1'
+// ERROR8: 19:21: note: declared in 'Class1'
 #endif
 
 #ifdef ERROR9
 defm error9: MC1;
 // ERROR9: value not specified for template argument 'MC1::nm'
-// ERROR9: 99:23: note: declared in 'MC1'
+// ERROR9: 100:23: note: declared in 'MC1'
 #endif
 
 #ifdef ERROR10
@@ -172,5 +173,15 @@ def error10 {
   int value = Class2<>.Code;
 }
 // ERROR10: value not specified for template argument 'Class2:cd'
-// ERROR10: 37:22: note: declared in 'Class2'
+// ERROR10: 38:22: note: declared in 'Class2'
+#endif
+
+#ifdef ERROR11
+
+class Foo<int i, int j>;
+
+def error11 : Foo<"", "">;
+// ERROR11: [[#@LINE-1]]:19: error: Value specified for template argument 'Foo:i' is of type string; expected type int: ""
+// ERROR11: [[#@LINE-2]]:23: error: Value specified for template argument 'Foo:j' is of type string; expected type int: ""
+
 #endif
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index 36f6afa..954c05b 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -3085,9 +3085,12 @@ static const X86FoldTableEntry Table2[] = {
   {X86::VMINMAXPSZ128rri, X86::VMINMAXPSZ128rmi, 0},
   {X86::VMINMAXPSZ256rri, X86::VMINMAXPSZ256rmi, 0},
   {X86::VMINMAXPSZrri, X86::VMINMAXPSZrmi, 0},
-  {X86::VMINMAXSDrri, X86::VMINMAXSDrmi, TB_NO_REVERSE},
-  {X86::VMINMAXSHrri, X86::VMINMAXSHrmi, TB_NO_REVERSE},
-  {X86::VMINMAXSSrri, X86::VMINMAXSSrmi, TB_NO_REVERSE},
+  {X86::VMINMAXSDrri, X86::VMINMAXSDrmi, 0},
+  {X86::VMINMAXSDrri_Int, X86::VMINMAXSDrmi_Int, TB_NO_REVERSE},
+  {X86::VMINMAXSHrri, X86::VMINMAXSHrmi, 0},
+  {X86::VMINMAXSHrri_Int, X86::VMINMAXSHrmi_Int, TB_NO_REVERSE},
+  {X86::VMINMAXSSrri, X86::VMINMAXSSrmi, 0},
+  {X86::VMINMAXSSrri_Int, X86::VMINMAXSSrmi_Int, TB_NO_REVERSE},
   {X86::VMINPBF16Z128rr, X86::VMINPBF16Z128rm, 0},
   {X86::VMINPBF16Z256rr, X86::VMINPBF16Z256rm, 0},
   {X86::VMINPBF16Zrr, X86::VMINPBF16Zrm, 0},
@@ -4236,9 +4239,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0},
   {X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0},
   {X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0},
-  {X86::VADDSDZrr_Intkz, X86::VADDSDZrm_Intkz, TB_NO_REVERSE},
-  {X86::VADDSHZrr_Intkz, X86::VADDSHZrm_Intkz, TB_NO_REVERSE},
-  {X86::VADDSSZrr_Intkz, X86::VADDSSZrm_Intkz, TB_NO_REVERSE},
+  {X86::VADDSDZrrkz_Int, X86::VADDSDZrmkz_Int, TB_NO_REVERSE},
+  {X86::VADDSHZrrkz_Int, X86::VADDSHZrmkz_Int, TB_NO_REVERSE},
+  {X86::VADDSSZrrkz_Int, X86::VADDSSZrmkz_Int, TB_NO_REVERSE},
   {X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0},
   {X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0},
   {X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0},
@@ -4285,9 +4288,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VCMPPSZ128rrik, X86::VCMPPSZ128rmik, 0},
   {X86::VCMPPSZ256rrik, X86::VCMPPSZ256rmik, 0},
   {X86::VCMPPSZrrik, X86::VCMPPSZrmik, 0},
-  {X86::VCMPSDZrri_Intk, X86::VCMPSDZrmi_Intk, TB_NO_REVERSE},
-  {X86::VCMPSHZrri_Intk, X86::VCMPSHZrmi_Intk, TB_NO_REVERSE},
-  {X86::VCMPSSZrri_Intk, X86::VCMPSSZrmi_Intk, TB_NO_REVERSE},
+  {X86::VCMPSDZrrik_Int, X86::VCMPSDZrmik_Int, TB_NO_REVERSE},
+  {X86::VCMPSHZrrik_Int, X86::VCMPSHZrmik_Int, TB_NO_REVERSE},
+  {X86::VCMPSSZrrik_Int, X86::VCMPSSZrmik_Int, TB_NO_REVERSE},
   {X86::VCVT2PS2PHXZ128rrkz, X86::VCVT2PS2PHXZ128rmkz, 0},
   {X86::VCVT2PS2PHXZ256rrkz, X86::VCVT2PS2PHXZ256rmkz, 0},
   {X86::VCVT2PS2PHXZrrkz, X86::VCVT2PS2PHXZrmkz, 0},
@@ -4435,12 +4438,12 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VCVTQQ2PSZ128rrk, X86::VCVTQQ2PSZ128rmk, 0},
   {X86::VCVTQQ2PSZ256rrk, X86::VCVTQQ2PSZ256rmk, 0},
   {X86::VCVTQQ2PSZrrk, X86::VCVTQQ2PSZrmk, 0},
-  {X86::VCVTSD2SHZrr_Intkz, X86::VCVTSD2SHZrm_Intkz, TB_NO_REVERSE},
-  {X86::VCVTSD2SSZrr_Intkz, X86::VCVTSD2SSZrm_Intkz, TB_NO_REVERSE},
-  {X86::VCVTSH2SDZrr_Intkz, X86::VCVTSH2SDZrm_Intkz, TB_NO_REVERSE},
-  {X86::VCVTSH2SSZrr_Intkz, X86::VCVTSH2SSZrm_Intkz, TB_NO_REVERSE},
-  {X86::VCVTSS2SDZrr_Intkz, X86::VCVTSS2SDZrm_Intkz, TB_NO_REVERSE},
-  {X86::VCVTSS2SHZrr_Intkz, X86::VCVTSS2SHZrm_Intkz, TB_NO_REVERSE},
+  {X86::VCVTSD2SHZrrkz_Int, X86::VCVTSD2SHZrmkz_Int, TB_NO_REVERSE},
+  {X86::VCVTSD2SSZrrkz_Int, X86::VCVTSD2SSZrmkz_Int, TB_NO_REVERSE},
+  {X86::VCVTSH2SDZrrkz_Int, X86::VCVTSH2SDZrmkz_Int, TB_NO_REVERSE},
+  {X86::VCVTSH2SSZrrkz_Int, X86::VCVTSH2SSZrmkz_Int, TB_NO_REVERSE},
+  {X86::VCVTSS2SDZrrkz_Int, X86::VCVTSS2SDZrmkz_Int, TB_NO_REVERSE},
+  {X86::VCVTSS2SHZrrkz_Int, X86::VCVTSS2SHZrmkz_Int, TB_NO_REVERSE},
   {X86::VCVTTNEBF162IBSZ128rrk, X86::VCVTTNEBF162IBSZ128rmk, 0},
   {X86::VCVTTNEBF162IBSZ256rrk, X86::VCVTTNEBF162IBSZ256rmk, 0},
   {X86::VCVTTNEBF162IBSZrrk, X86::VCVTTNEBF162IBSZrmk, 0},
@@ -4564,9 +4567,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0},
   {X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0},
   {X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0},
-  {X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE},
-  {X86::VDIVSHZrr_Intkz, X86::VDIVSHZrm_Intkz, TB_NO_REVERSE},
-  {X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE},
+  {X86::VDIVSDZrrkz_Int, X86::VDIVSDZrmkz_Int, TB_NO_REVERSE},
+  {X86::VDIVSHZrrkz_Int, X86::VDIVSHZrmkz_Int, TB_NO_REVERSE},
+  {X86::VDIVSSZrrkz_Int, X86::VDIVSSZrmkz_Int, TB_NO_REVERSE},
   {X86::VDPBF16PSZ128r, X86::VDPBF16PSZ128m, 0},
   {X86::VDPBF16PSZ256r, X86::VDPBF16PSZ256m, 0},
   {X86::VDPBF16PSZr, X86::VDPBF16PSZm, 0},
@@ -5107,9 +5110,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0},
   {X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0},
   {X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0},
-  {X86::VMAXSDZrr_Intkz, X86::VMAXSDZrm_Intkz, TB_NO_REVERSE},
-  {X86::VMAXSHZrr_Intkz, X86::VMAXSHZrm_Intkz, TB_NO_REVERSE},
-  {X86::VMAXSSZrr_Intkz, X86::VMAXSSZrm_Intkz, TB_NO_REVERSE},
+  {X86::VMAXSDZrrkz_Int, X86::VMAXSDZrmkz_Int, TB_NO_REVERSE},
+  {X86::VMAXSHZrrkz_Int, X86::VMAXSHZrmkz_Int, TB_NO_REVERSE},
+  {X86::VMAXSSZrrkz_Int, X86::VMAXSSZrmkz_Int, TB_NO_REVERSE},
   {X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0},
   {X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0},
   {X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0},
@@ -5131,9 +5134,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VMINMAXPSZ128rrikz, X86::VMINMAXPSZ128rmikz, 0},
   {X86::VMINMAXPSZ256rrikz, X86::VMINMAXPSZ256rmikz, 0},
   {X86::VMINMAXPSZrrikz, X86::VMINMAXPSZrmikz, 0},
-  {X86::VMINMAXSDrrikz, X86::VMINMAXSDrmikz, TB_NO_REVERSE},
-  {X86::VMINMAXSHrrikz, X86::VMINMAXSHrmikz, TB_NO_REVERSE},
-  {X86::VMINMAXSSrrikz, X86::VMINMAXSSrmikz, TB_NO_REVERSE},
+  {X86::VMINMAXSDrrikz_Int, X86::VMINMAXSDrmikz_Int, TB_NO_REVERSE},
+  {X86::VMINMAXSHrrikz_Int, X86::VMINMAXSHrmikz_Int, TB_NO_REVERSE},
+  {X86::VMINMAXSSrrikz_Int, X86::VMINMAXSSrmikz_Int, TB_NO_REVERSE},
   {X86::VMINPBF16Z128rrkz, X86::VMINPBF16Z128rmkz, 0},
   {X86::VMINPBF16Z256rrkz, X86::VMINPBF16Z256rmkz, 0},
   {X86::VMINPBF16Zrrkz, X86::VMINPBF16Zrmkz, 0},
@@ -5146,9 +5149,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0},
   {X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0},
   {X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0},
-  {X86::VMINSDZrr_Intkz, X86::VMINSDZrm_Intkz, TB_NO_REVERSE},
-  {X86::VMINSHZrr_Intkz, X86::VMINSHZrm_Intkz, TB_NO_REVERSE},
-  {X86::VMINSSZrr_Intkz, X86::VMINSSZrm_Intkz, TB_NO_REVERSE},
+  {X86::VMINSDZrrkz_Int, X86::VMINSDZrmkz_Int, TB_NO_REVERSE},
+  {X86::VMINSHZrrkz_Int, X86::VMINSHZrmkz_Int, TB_NO_REVERSE},
+  {X86::VMINSSZrrkz_Int, X86::VMINSSZrmkz_Int, TB_NO_REVERSE},
   {X86::VMOVAPDZ128rrk, X86::VMOVAPDZ128rmk, TB_NO_REVERSE|TB_ALIGN_16},
   {X86::VMOVAPDZ256rrk, X86::VMOVAPDZ256rmk, TB_NO_REVERSE|TB_ALIGN_32},
   {X86::VMOVAPDZrrk, X86::VMOVAPDZrmk, TB_NO_REVERSE|TB_ALIGN_64},
@@ -5203,9 +5206,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0},
   {X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0},
   {X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0},
-  {X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE},
-  {X86::VMULSHZrr_Intkz, X86::VMULSHZrm_Intkz, TB_NO_REVERSE},
-  {X86::VMULSSZrr_Intkz, X86::VMULSSZrm_Intkz, TB_NO_REVERSE},
+  {X86::VMULSDZrrkz_Int, X86::VMULSDZrmkz_Int, TB_NO_REVERSE},
+  {X86::VMULSHZrrkz_Int, X86::VMULSHZrmkz_Int, TB_NO_REVERSE},
+  {X86::VMULSSZrrkz_Int, X86::VMULSSZrmkz_Int, TB_NO_REVERSE},
   {X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0},
   {X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0},
   {X86::VORPDZrrkz, X86::VORPDZrmkz, 0},
@@ -5969,9 +5972,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VRNDSCALEPSZ128rrik, X86::VRNDSCALEPSZ128rmik, 0},
   {X86::VRNDSCALEPSZ256rrik, X86::VRNDSCALEPSZ256rmik, 0},
   {X86::VRNDSCALEPSZrrik, X86::VRNDSCALEPSZrmik, 0},
-  {X86::VRNDSCALESDZrri_Intkz, X86::VRNDSCALESDZrmi_Intkz, TB_NO_REVERSE},
-  {X86::VRNDSCALESHZrri_Intkz, X86::VRNDSCALESHZrmi_Intkz, TB_NO_REVERSE},
-  {X86::VRNDSCALESSZrri_Intkz, X86::VRNDSCALESSZrmi_Intkz, TB_NO_REVERSE},
+  {X86::VRNDSCALESDZrrikz_Int, X86::VRNDSCALESDZrmikz_Int, TB_NO_REVERSE},
+  {X86::VRNDSCALESHZrrikz_Int, X86::VRNDSCALESHZrmikz_Int, TB_NO_REVERSE},
+  {X86::VRNDSCALESSZrrikz_Int, X86::VRNDSCALESSZrmikz_Int, TB_NO_REVERSE},
   {X86::VRSQRT14PDZ128rk, X86::VRSQRT14PDZ128mk, 0},
   {X86::VRSQRT14PDZ256rk, X86::VRSQRT14PDZ256mk, 0},
   {X86::VRSQRT14PDZrk, X86::VRSQRT14PDZmk, 0},
@@ -6035,9 +6038,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VSQRTPSZ128rk, X86::VSQRTPSZ128mk, 0},
   {X86::VSQRTPSZ256rk, X86::VSQRTPSZ256mk, 0},
   {X86::VSQRTPSZrk, X86::VSQRTPSZmk, 0},
-  {X86::VSQRTSDZr_Intkz, X86::VSQRTSDZm_Intkz, TB_NO_REVERSE},
-  {X86::VSQRTSHZr_Intkz, X86::VSQRTSHZm_Intkz, TB_NO_REVERSE},
-  {X86::VSQRTSSZr_Intkz, X86::VSQRTSSZm_Intkz, TB_NO_REVERSE},
+  {X86::VSQRTSDZrkz_Int, X86::VSQRTSDZmkz_Int, TB_NO_REVERSE},
+  {X86::VSQRTSHZrkz_Int, X86::VSQRTSHZmkz_Int, TB_NO_REVERSE},
+  {X86::VSQRTSSZrkz_Int, X86::VSQRTSSZmkz_Int, TB_NO_REVERSE},
   {X86::VSUBNEPBF16Z128rrkz, X86::VSUBNEPBF16Z128rmkz, 0},
   {X86::VSUBNEPBF16Z256rrkz, X86::VSUBNEPBF16Z256rmkz, 0},
   {X86::VSUBNEPBF16Zrrkz, X86::VSUBNEPBF16Zrmkz, 0},
@@ -6050,9 +6053,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0},
   {X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0},
   {X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0},
-  {X86::VSUBSDZrr_Intkz, X86::VSUBSDZrm_Intkz, TB_NO_REVERSE},
-  {X86::VSUBSHZrr_Intkz, X86::VSUBSHZrm_Intkz, TB_NO_REVERSE},
-  {X86::VSUBSSZrr_Intkz, X86::VSUBSSZrm_Intkz, TB_NO_REVERSE},
+  {X86::VSUBSDZrrkz_Int, X86::VSUBSDZrmkz_Int, TB_NO_REVERSE},
+  {X86::VSUBSHZrrkz_Int, X86::VSUBSHZrmkz_Int, TB_NO_REVERSE},
+  {X86::VSUBSSZrrkz_Int, X86::VSUBSSZrmkz_Int, TB_NO_REVERSE},
   {X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0},
   {X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0},
   {X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0},
@@ -6086,9 +6089,9 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0},
   {X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0},
   {X86::VADDPSZrrk, X86::VADDPSZrmk, 0},
-  {X86::VADDSDZrr_Intk, X86::VADDSDZrm_Intk, TB_NO_REVERSE},
-  {X86::VADDSHZrr_Intk, X86::VADDSHZrm_Intk, TB_NO_REVERSE},
-  {X86::VADDSSZrr_Intk, X86::VADDSSZrm_Intk, TB_NO_REVERSE},
+  {X86::VADDSDZrrk_Int, X86::VADDSDZrmk_Int, TB_NO_REVERSE},
+  {X86::VADDSHZrrk_Int, X86::VADDSHZrmk_Int, TB_NO_REVERSE},
+  {X86::VADDSSZrrk_Int, X86::VADDSSZrmk_Int, TB_NO_REVERSE},
   {X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0},
   {X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0},
   {X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0},
@@ -6137,12 +6140,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VCVTNE2PS2BF16Z128rrk, X86::VCVTNE2PS2BF16Z128rmk, 0},
   {X86::VCVTNE2PS2BF16Z256rrk, X86::VCVTNE2PS2BF16Z256rmk, 0},
   {X86::VCVTNE2PS2BF16Zrrk, X86::VCVTNE2PS2BF16Zrmk, 0},
-  {X86::VCVTSD2SHZrr_Intk, X86::VCVTSD2SHZrm_Intk, TB_NO_REVERSE},
-  {X86::VCVTSD2SSZrr_Intk, X86::VCVTSD2SSZrm_Intk, TB_NO_REVERSE},
-  {X86::VCVTSH2SDZrr_Intk, X86::VCVTSH2SDZrm_Intk, TB_NO_REVERSE},
-  {X86::VCVTSH2SSZrr_Intk, X86::VCVTSH2SSZrm_Intk, TB_NO_REVERSE},
-  {X86::VCVTSS2SDZrr_Intk, X86::VCVTSS2SDZrm_Intk, TB_NO_REVERSE},
-  {X86::VCVTSS2SHZrr_Intk, X86::VCVTSS2SHZrm_Intk, TB_NO_REVERSE},
+  {X86::VCVTSD2SHZrrk_Int, X86::VCVTSD2SHZrmk_Int, TB_NO_REVERSE},
+  {X86::VCVTSD2SSZrrk_Int, X86::VCVTSD2SSZrmk_Int, TB_NO_REVERSE},
+  {X86::VCVTSH2SDZrrk_Int, X86::VCVTSH2SDZrmk_Int, TB_NO_REVERSE},
+  {X86::VCVTSH2SSZrrk_Int, X86::VCVTSH2SSZrmk_Int, TB_NO_REVERSE},
+  {X86::VCVTSS2SDZrrk_Int, X86::VCVTSS2SDZrmk_Int, TB_NO_REVERSE},
+  {X86::VCVTSS2SHZrrk_Int, X86::VCVTSS2SHZrmk_Int, TB_NO_REVERSE},
   {X86::VDBPSADBWZ128rrik, X86::VDBPSADBWZ128rmik, 0},
   {X86::VDBPSADBWZ256rrik, X86::VDBPSADBWZ256rmik, 0},
   {X86::VDBPSADBWZrrik, X86::VDBPSADBWZrmik, 0},
@@ -6158,9 +6161,9 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0},
   {X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0},
   {X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0},
-  {X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE},
-  {X86::VDIVSHZrr_Intk, X86::VDIVSHZrm_Intk, TB_NO_REVERSE},
-  {X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE},
+  {X86::VDIVSDZrrk_Int, X86::VDIVSDZrmk_Int, TB_NO_REVERSE},
+  {X86::VDIVSHZrrk_Int, X86::VDIVSHZrmk_Int, TB_NO_REVERSE},
+  {X86::VDIVSSZrrk_Int, X86::VDIVSSZrmk_Int, TB_NO_REVERSE},
   {X86::VDPBF16PSZ128rk, X86::VDPBF16PSZ128mk, 0},
   {X86::VDPBF16PSZ128rkz, X86::VDPBF16PSZ128mkz, 0},
   {X86::VDPBF16PSZ256rk, X86::VDPBF16PSZ256mk, 0},
@@ -6225,12 +6228,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFMADD132PSZ256rkz, X86::VFMADD132PSZ256mkz, 0},
   {X86::VFMADD132PSZrk, X86::VFMADD132PSZmk, 0},
   {X86::VFMADD132PSZrkz, X86::VFMADD132PSZmkz, 0},
-  {X86::VFMADD132SDZr_Intk, X86::VFMADD132SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFMADD132SDZr_Intkz, X86::VFMADD132SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMADD132SHZr_Intk, X86::VFMADD132SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFMADD132SHZr_Intkz, X86::VFMADD132SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMADD132SSZr_Intk, X86::VFMADD132SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFMADD132SSZr_Intkz, X86::VFMADD132SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFMADD132SDZrk_Int, X86::VFMADD132SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFMADD132SDZrkz_Int, X86::VFMADD132SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMADD132SHZrk_Int, X86::VFMADD132SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFMADD132SHZrkz_Int, X86::VFMADD132SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMADD132SSZrk_Int, X86::VFMADD132SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFMADD132SSZrkz_Int, X86::VFMADD132SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFMADD213NEPBF16Z128rk, X86::VFMADD213NEPBF16Z128mk, 0},
   {X86::VFMADD213NEPBF16Z128rkz, X86::VFMADD213NEPBF16Z128mkz, 0},
   {X86::VFMADD213NEPBF16Z256rk, X86::VFMADD213NEPBF16Z256mk, 0},
@@ -6255,12 +6258,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFMADD213PSZ256rkz, X86::VFMADD213PSZ256mkz, 0},
   {X86::VFMADD213PSZrk, X86::VFMADD213PSZmk, 0},
   {X86::VFMADD213PSZrkz, X86::VFMADD213PSZmkz, 0},
-  {X86::VFMADD213SDZr_Intk, X86::VFMADD213SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFMADD213SDZr_Intkz, X86::VFMADD213SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMADD213SHZr_Intk, X86::VFMADD213SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFMADD213SHZr_Intkz, X86::VFMADD213SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMADD213SSZr_Intk, X86::VFMADD213SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFMADD213SSZr_Intkz, X86::VFMADD213SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFMADD213SDZrk_Int, X86::VFMADD213SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFMADD213SDZrkz_Int, X86::VFMADD213SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMADD213SHZrk_Int, X86::VFMADD213SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFMADD213SHZrkz_Int, X86::VFMADD213SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMADD213SSZrk_Int, X86::VFMADD213SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFMADD213SSZrkz_Int, X86::VFMADD213SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFMADD231NEPBF16Z128rk, X86::VFMADD231NEPBF16Z128mk, 0},
   {X86::VFMADD231NEPBF16Z128rkz, X86::VFMADD231NEPBF16Z128mkz, 0},
   {X86::VFMADD231NEPBF16Z256rk, X86::VFMADD231NEPBF16Z256mk, 0},
@@ -6285,12 +6288,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFMADD231PSZ256rkz, X86::VFMADD231PSZ256mkz, 0},
   {X86::VFMADD231PSZrk, X86::VFMADD231PSZmk, 0},
   {X86::VFMADD231PSZrkz, X86::VFMADD231PSZmkz, 0},
-  {X86::VFMADD231SDZr_Intk, X86::VFMADD231SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFMADD231SDZr_Intkz, X86::VFMADD231SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMADD231SHZr_Intk, X86::VFMADD231SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFMADD231SHZr_Intkz, X86::VFMADD231SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMADD231SSZr_Intk, X86::VFMADD231SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFMADD231SSZr_Intkz, X86::VFMADD231SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFMADD231SDZrk_Int, X86::VFMADD231SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFMADD231SDZrkz_Int, X86::VFMADD231SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMADD231SHZrk_Int, X86::VFMADD231SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFMADD231SHZrkz_Int, X86::VFMADD231SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMADD231SSZrk_Int, X86::VFMADD231SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFMADD231SSZrkz_Int, X86::VFMADD231SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFMADDCPHZ128rk, X86::VFMADDCPHZ128mk, 0},
   {X86::VFMADDCPHZ128rkz, X86::VFMADDCPHZ128mkz, 0},
   {X86::VFMADDCPHZ256rk, X86::VFMADDCPHZ256mk, 0},
@@ -6377,12 +6380,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFMSUB132PSZ256rkz, X86::VFMSUB132PSZ256mkz, 0},
   {X86::VFMSUB132PSZrk, X86::VFMSUB132PSZmk, 0},
   {X86::VFMSUB132PSZrkz, X86::VFMSUB132PSZmkz, 0},
-  {X86::VFMSUB132SDZr_Intk, X86::VFMSUB132SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFMSUB132SDZr_Intkz, X86::VFMSUB132SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMSUB132SHZr_Intk, X86::VFMSUB132SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFMSUB132SHZr_Intkz, X86::VFMSUB132SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMSUB132SSZr_Intk, X86::VFMSUB132SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFMSUB132SSZr_Intkz, X86::VFMSUB132SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFMSUB132SDZrk_Int, X86::VFMSUB132SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFMSUB132SDZrkz_Int, X86::VFMSUB132SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMSUB132SHZrk_Int, X86::VFMSUB132SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFMSUB132SHZrkz_Int, X86::VFMSUB132SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMSUB132SSZrk_Int, X86::VFMSUB132SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFMSUB132SSZrkz_Int, X86::VFMSUB132SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFMSUB213NEPBF16Z128rk, X86::VFMSUB213NEPBF16Z128mk, 0},
   {X86::VFMSUB213NEPBF16Z128rkz, X86::VFMSUB213NEPBF16Z128mkz, 0},
   {X86::VFMSUB213NEPBF16Z256rk, X86::VFMSUB213NEPBF16Z256mk, 0},
@@ -6407,12 +6410,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFMSUB213PSZ256rkz, X86::VFMSUB213PSZ256mkz, 0},
   {X86::VFMSUB213PSZrk, X86::VFMSUB213PSZmk, 0},
   {X86::VFMSUB213PSZrkz, X86::VFMSUB213PSZmkz, 0},
-  {X86::VFMSUB213SDZr_Intk, X86::VFMSUB213SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFMSUB213SDZr_Intkz, X86::VFMSUB213SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMSUB213SHZr_Intk, X86::VFMSUB213SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFMSUB213SHZr_Intkz, X86::VFMSUB213SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMSUB213SSZr_Intk, X86::VFMSUB213SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFMSUB213SSZr_Intkz, X86::VFMSUB213SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFMSUB213SDZrk_Int, X86::VFMSUB213SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFMSUB213SDZrkz_Int, X86::VFMSUB213SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMSUB213SHZrk_Int, X86::VFMSUB213SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFMSUB213SHZrkz_Int, X86::VFMSUB213SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMSUB213SSZrk_Int, X86::VFMSUB213SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFMSUB213SSZrkz_Int, X86::VFMSUB213SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFMSUB231NEPBF16Z128rk, X86::VFMSUB231NEPBF16Z128mk, 0},
   {X86::VFMSUB231NEPBF16Z128rkz, X86::VFMSUB231NEPBF16Z128mkz, 0},
   {X86::VFMSUB231NEPBF16Z256rk, X86::VFMSUB231NEPBF16Z256mk, 0},
@@ -6437,12 +6440,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFMSUB231PSZ256rkz, X86::VFMSUB231PSZ256mkz, 0},
   {X86::VFMSUB231PSZrk, X86::VFMSUB231PSZmk, 0},
   {X86::VFMSUB231PSZrkz, X86::VFMSUB231PSZmkz, 0},
-  {X86::VFMSUB231SDZr_Intk, X86::VFMSUB231SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFMSUB231SDZr_Intkz, X86::VFMSUB231SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMSUB231SHZr_Intk, X86::VFMSUB231SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFMSUB231SHZr_Intkz, X86::VFMSUB231SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMSUB231SSZr_Intk, X86::VFMSUB231SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFMSUB231SSZr_Intkz, X86::VFMSUB231SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFMSUB231SDZrk_Int, X86::VFMSUB231SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFMSUB231SDZrkz_Int, X86::VFMSUB231SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMSUB231SHZrk_Int, X86::VFMSUB231SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFMSUB231SHZrkz_Int, X86::VFMSUB231SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMSUB231SSZrk_Int, X86::VFMSUB231SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFMSUB231SSZrkz_Int, X86::VFMSUB231SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFMSUBADD132PDZ128rk, X86::VFMSUBADD132PDZ128mk, 0},
   {X86::VFMSUBADD132PDZ128rkz, X86::VFMSUBADD132PDZ128mkz, 0},
   {X86::VFMSUBADD132PDZ256rk, X86::VFMSUBADD132PDZ256mk, 0},
@@ -6525,12 +6528,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFNMADD132PSZ256rkz, X86::VFNMADD132PSZ256mkz, 0},
   {X86::VFNMADD132PSZrk, X86::VFNMADD132PSZmk, 0},
   {X86::VFNMADD132PSZrkz, X86::VFNMADD132PSZmkz, 0},
-  {X86::VFNMADD132SDZr_Intk, X86::VFNMADD132SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMADD132SDZr_Intkz, X86::VFNMADD132SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMADD132SHZr_Intk, X86::VFNMADD132SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMADD132SHZr_Intkz, X86::VFNMADD132SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMADD132SSZr_Intk, X86::VFNMADD132SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMADD132SSZr_Intkz, X86::VFNMADD132SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFNMADD132SDZrk_Int, X86::VFNMADD132SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMADD132SDZrkz_Int, X86::VFNMADD132SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMADD132SHZrk_Int, X86::VFNMADD132SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMADD132SHZrkz_Int, X86::VFNMADD132SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMADD132SSZrk_Int, X86::VFNMADD132SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMADD132SSZrkz_Int, X86::VFNMADD132SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFNMADD213NEPBF16Z128rk, X86::VFNMADD213NEPBF16Z128mk, 0},
   {X86::VFNMADD213NEPBF16Z128rkz, X86::VFNMADD213NEPBF16Z128mkz, 0},
   {X86::VFNMADD213NEPBF16Z256rk, X86::VFNMADD213NEPBF16Z256mk, 0},
@@ -6555,12 +6558,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFNMADD213PSZ256rkz, X86::VFNMADD213PSZ256mkz, 0},
   {X86::VFNMADD213PSZrk, X86::VFNMADD213PSZmk, 0},
   {X86::VFNMADD213PSZrkz, X86::VFNMADD213PSZmkz, 0},
-  {X86::VFNMADD213SDZr_Intk, X86::VFNMADD213SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMADD213SDZr_Intkz, X86::VFNMADD213SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMADD213SHZr_Intk, X86::VFNMADD213SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMADD213SHZr_Intkz, X86::VFNMADD213SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMADD213SSZr_Intk, X86::VFNMADD213SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMADD213SSZr_Intkz, X86::VFNMADD213SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFNMADD213SDZrk_Int, X86::VFNMADD213SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMADD213SDZrkz_Int, X86::VFNMADD213SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMADD213SHZrk_Int, X86::VFNMADD213SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMADD213SHZrkz_Int, X86::VFNMADD213SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMADD213SSZrk_Int, X86::VFNMADD213SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMADD213SSZrkz_Int, X86::VFNMADD213SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFNMADD231NEPBF16Z128rk, X86::VFNMADD231NEPBF16Z128mk, 0},
   {X86::VFNMADD231NEPBF16Z128rkz, X86::VFNMADD231NEPBF16Z128mkz, 0},
   {X86::VFNMADD231NEPBF16Z256rk, X86::VFNMADD231NEPBF16Z256mk, 0},
@@ -6585,12 +6588,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFNMADD231PSZ256rkz, X86::VFNMADD231PSZ256mkz, 0},
   {X86::VFNMADD231PSZrk, X86::VFNMADD231PSZmk, 0},
   {X86::VFNMADD231PSZrkz, X86::VFNMADD231PSZmkz, 0},
-  {X86::VFNMADD231SDZr_Intk, X86::VFNMADD231SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMADD231SDZr_Intkz, X86::VFNMADD231SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMADD231SHZr_Intk, X86::VFNMADD231SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMADD231SHZr_Intkz, X86::VFNMADD231SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMADD231SSZr_Intk, X86::VFNMADD231SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMADD231SSZr_Intkz, X86::VFNMADD231SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFNMADD231SDZrk_Int, X86::VFNMADD231SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMADD231SDZrkz_Int, X86::VFNMADD231SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMADD231SHZrk_Int, X86::VFNMADD231SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMADD231SHZrkz_Int, X86::VFNMADD231SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMADD231SSZrk_Int, X86::VFNMADD231SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMADD231SSZrkz_Int, X86::VFNMADD231SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFNMSUB132NEPBF16Z128rk, X86::VFNMSUB132NEPBF16Z128mk, 0},
   {X86::VFNMSUB132NEPBF16Z128rkz, X86::VFNMSUB132NEPBF16Z128mkz, 0},
   {X86::VFNMSUB132NEPBF16Z256rk, X86::VFNMSUB132NEPBF16Z256mk, 0},
@@ -6615,12 +6618,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFNMSUB132PSZ256rkz, X86::VFNMSUB132PSZ256mkz, 0},
   {X86::VFNMSUB132PSZrk, X86::VFNMSUB132PSZmk, 0},
   {X86::VFNMSUB132PSZrkz, X86::VFNMSUB132PSZmkz, 0},
-  {X86::VFNMSUB132SDZr_Intk, X86::VFNMSUB132SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMSUB132SDZr_Intkz, X86::VFNMSUB132SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMSUB132SHZr_Intk, X86::VFNMSUB132SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMSUB132SHZr_Intkz, X86::VFNMSUB132SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMSUB132SSZr_Intk, X86::VFNMSUB132SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMSUB132SSZr_Intkz, X86::VFNMSUB132SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFNMSUB132SDZrk_Int, X86::VFNMSUB132SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB132SDZrkz_Int, X86::VFNMSUB132SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB132SHZrk_Int, X86::VFNMSUB132SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB132SHZrkz_Int, X86::VFNMSUB132SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB132SSZrk_Int, X86::VFNMSUB132SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB132SSZrkz_Int, X86::VFNMSUB132SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFNMSUB213NEPBF16Z128rk, X86::VFNMSUB213NEPBF16Z128mk, 0},
   {X86::VFNMSUB213NEPBF16Z128rkz, X86::VFNMSUB213NEPBF16Z128mkz, 0},
   {X86::VFNMSUB213NEPBF16Z256rk, X86::VFNMSUB213NEPBF16Z256mk, 0},
@@ -6645,12 +6648,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFNMSUB213PSZ256rkz, X86::VFNMSUB213PSZ256mkz, 0},
   {X86::VFNMSUB213PSZrk, X86::VFNMSUB213PSZmk, 0},
   {X86::VFNMSUB213PSZrkz, X86::VFNMSUB213PSZmkz, 0},
-  {X86::VFNMSUB213SDZr_Intk, X86::VFNMSUB213SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMSUB213SDZr_Intkz, X86::VFNMSUB213SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMSUB213SHZr_Intk, X86::VFNMSUB213SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMSUB213SHZr_Intkz, X86::VFNMSUB213SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMSUB213SSZr_Intk, X86::VFNMSUB213SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMSUB213SSZr_Intkz, X86::VFNMSUB213SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFNMSUB213SDZrk_Int, X86::VFNMSUB213SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB213SDZrkz_Int, X86::VFNMSUB213SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB213SHZrk_Int, X86::VFNMSUB213SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB213SHZrkz_Int, X86::VFNMSUB213SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB213SSZrk_Int, X86::VFNMSUB213SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB213SSZrkz_Int, X86::VFNMSUB213SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFNMSUB231NEPBF16Z128rk, X86::VFNMSUB231NEPBF16Z128mk, 0},
   {X86::VFNMSUB231NEPBF16Z128rkz, X86::VFNMSUB231NEPBF16Z128mkz, 0},
   {X86::VFNMSUB231NEPBF16Z256rk, X86::VFNMSUB231NEPBF16Z256mk, 0},
@@ -6675,12 +6678,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFNMSUB231PSZ256rkz, X86::VFNMSUB231PSZ256mkz, 0},
   {X86::VFNMSUB231PSZrk, X86::VFNMSUB231PSZmk, 0},
   {X86::VFNMSUB231PSZrkz, X86::VFNMSUB231PSZmkz, 0},
-  {X86::VFNMSUB231SDZr_Intk, X86::VFNMSUB231SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMSUB231SDZr_Intkz, X86::VFNMSUB231SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMSUB231SHZr_Intk, X86::VFNMSUB231SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMSUB231SHZr_Intkz, X86::VFNMSUB231SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMSUB231SSZr_Intk, X86::VFNMSUB231SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMSUB231SSZr_Intkz, X86::VFNMSUB231SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFNMSUB231SDZrk_Int, X86::VFNMSUB231SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB231SDZrkz_Int, X86::VFNMSUB231SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB231SHZrk_Int, X86::VFNMSUB231SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB231SHZrkz_Int, X86::VFNMSUB231SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB231SSZrk_Int, X86::VFNMSUB231SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB231SSZrkz_Int, X86::VFNMSUB231SSZmkz_Int, TB_NO_REVERSE},
   {X86::VGETEXPSDZrk, X86::VGETEXPSDZmk, TB_NO_REVERSE},
   {X86::VGETEXPSHZrk, X86::VGETEXPSHZmk, TB_NO_REVERSE},
   {X86::VGETEXPSSZrk, X86::VGETEXPSSZmk, TB_NO_REVERSE},
@@ -6729,9 +6732,9 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0},
   {X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0},
   {X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0},
-  {X86::VMAXSDZrr_Intk, X86::VMAXSDZrm_Intk, TB_NO_REVERSE},
-  {X86::VMAXSHZrr_Intk, X86::VMAXSHZrm_Intk, TB_NO_REVERSE},
-  {X86::VMAXSSZrr_Intk, X86::VMAXSSZrm_Intk, TB_NO_REVERSE},
+  {X86::VMAXSDZrrk_Int, X86::VMAXSDZrmk_Int, TB_NO_REVERSE},
+  {X86::VMAXSHZrrk_Int, X86::VMAXSHZrmk_Int, TB_NO_REVERSE},
+  {X86::VMAXSSZrrk_Int, X86::VMAXSSZrmk_Int, TB_NO_REVERSE},
   {X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0},
   {X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0},
   {X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0},
@@ -6753,9 +6756,9 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VMINMAXPSZ128rrik, X86::VMINMAXPSZ128rmik, 0},
   {X86::VMINMAXPSZ256rrik, X86::VMINMAXPSZ256rmik, 0},
   {X86::VMINMAXPSZrrik, X86::VMINMAXPSZrmik, 0},
-  {X86::VMINMAXSDrrik, X86::VMINMAXSDrmik, TB_NO_REVERSE},
-  {X86::VMINMAXSHrrik, X86::VMINMAXSHrmik, TB_NO_REVERSE},
-  {X86::VMINMAXSSrrik, X86::VMINMAXSSrmik, TB_NO_REVERSE},
+  {X86::VMINMAXSDrrik_Int, X86::VMINMAXSDrmik_Int, TB_NO_REVERSE},
+  {X86::VMINMAXSHrrik_Int, X86::VMINMAXSHrmik_Int, TB_NO_REVERSE},
+  {X86::VMINMAXSSrrik_Int, X86::VMINMAXSSrmik_Int, TB_NO_REVERSE},
   {X86::VMINPBF16Z128rrk, X86::VMINPBF16Z128rmk, 0},
   {X86::VMINPBF16Z256rrk, X86::VMINPBF16Z256rmk, 0},
   {X86::VMINPBF16Zrrk, X86::VMINPBF16Zrmk, 0},
@@ -6768,9 +6771,9 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0},
   {X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0},
   {X86::VMINPSZrrk, X86::VMINPSZrmk, 0},
-  {X86::VMINSDZrr_Intk, X86::VMINSDZrm_Intk, TB_NO_REVERSE},
-  {X86::VMINSHZrr_Intk, X86::VMINSHZrm_Intk, TB_NO_REVERSE},
-  {X86::VMINSSZrr_Intk, X86::VMINSSZrm_Intk, TB_NO_REVERSE},
+  {X86::VMINSDZrrk_Int, X86::VMINSDZrmk_Int, TB_NO_REVERSE},
+  {X86::VMINSHZrrk_Int, X86::VMINSHZrmk_Int, TB_NO_REVERSE},
+  {X86::VMINSSZrrk_Int, X86::VMINSSZrmk_Int, TB_NO_REVERSE},
   {X86::VMPSADBWZ128rrik, X86::VMPSADBWZ128rmik, 0},
   {X86::VMPSADBWZ256rrik, X86::VMPSADBWZ256rmik, 0},
   {X86::VMPSADBWZrrik, X86::VMPSADBWZrmik, 0},
@@ -6786,9 +6789,9 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0},
   {X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0},
   {X86::VMULPSZrrk, X86::VMULPSZrmk, 0},
-  {X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE},
-  {X86::VMULSHZrr_Intk, X86::VMULSHZrm_Intk, TB_NO_REVERSE},
-  {X86::VMULSSZrr_Intk, X86::VMULSSZrm_Intk, TB_NO_REVERSE},
+  {X86::VMULSDZrrk_Int, X86::VMULSDZrmk_Int, TB_NO_REVERSE},
+  {X86::VMULSHZrrk_Int, X86::VMULSHZrmk_Int, TB_NO_REVERSE},
+  {X86::VMULSSZrrk_Int, X86::VMULSSZrmk_Int, TB_NO_REVERSE},
   {X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0},
   {X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0},
   {X86::VORPDZrrk, X86::VORPDZrmk, 0},
@@ -7344,9 +7347,9 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VREDUCESDZrrik, X86::VREDUCESDZrmik, TB_NO_REVERSE},
   {X86::VREDUCESHZrrik, X86::VREDUCESHZrmik, TB_NO_REVERSE},
   {X86::VREDUCESSZrrik, X86::VREDUCESSZrmik, TB_NO_REVERSE},
-  {X86::VRNDSCALESDZrri_Intk, X86::VRNDSCALESDZrmi_Intk, TB_NO_REVERSE},
-  {X86::VRNDSCALESHZrri_Intk, X86::VRNDSCALESHZrmi_Intk, TB_NO_REVERSE},
-  {X86::VRNDSCALESSZrri_Intk, X86::VRNDSCALESSZrmi_Intk, TB_NO_REVERSE},
+  {X86::VRNDSCALESDZrrik_Int, X86::VRNDSCALESDZrmik_Int, TB_NO_REVERSE},
+  {X86::VRNDSCALESHZrrik_Int, X86::VRNDSCALESHZrmik_Int, TB_NO_REVERSE},
+  {X86::VRNDSCALESSZrrik_Int, X86::VRNDSCALESSZrmik_Int, TB_NO_REVERSE},
   {X86::VRSQRT14SDZrrk, X86::VRSQRT14SDZrmk, TB_NO_REVERSE},
   {X86::VRSQRT14SSZrrk, X86::VRSQRT14SSZrmk, TB_NO_REVERSE},
   {X86::VRSQRT28SDZrk, X86::VRSQRT28SDZmk, TB_NO_REVERSE},
@@ -7381,9 +7384,9 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VSHUFPSZ128rrik, X86::VSHUFPSZ128rmik, 0},
   {X86::VSHUFPSZ256rrik, X86::VSHUFPSZ256rmik, 0},
   {X86::VSHUFPSZrrik, X86::VSHUFPSZrmik, 0},
-  {X86::VSQRTSDZr_Intk, X86::VSQRTSDZm_Intk, TB_NO_REVERSE},
-  {X86::VSQRTSHZr_Intk, X86::VSQRTSHZm_Intk, TB_NO_REVERSE},
-  {X86::VSQRTSSZr_Intk, X86::VSQRTSSZm_Intk, TB_NO_REVERSE},
+  {X86::VSQRTSDZrk_Int, X86::VSQRTSDZmk_Int, TB_NO_REVERSE},
+  {X86::VSQRTSHZrk_Int, X86::VSQRTSHZmk_Int, TB_NO_REVERSE},
+  {X86::VSQRTSSZrk_Int, X86::VSQRTSSZmk_Int, TB_NO_REVERSE},
   {X86::VSUBNEPBF16Z128rrk, X86::VSUBNEPBF16Z128rmk, 0},
   {X86::VSUBNEPBF16Z256rrk, X86::VSUBNEPBF16Z256rmk, 0},
   {X86::VSUBNEPBF16Zrrk, X86::VSUBNEPBF16Zrmk, 0},
@@ -7396,9 +7399,9 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0},
   {X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0},
   {X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0},
-  {X86::VSUBSDZrr_Intk, X86::VSUBSDZrm_Intk, TB_NO_REVERSE},
-  {X86::VSUBSHZrr_Intk, X86::VSUBSHZrm_Intk, TB_NO_REVERSE},
-  {X86::VSUBSSZrr_Intk, X86::VSUBSSZrm_Intk, TB_NO_REVERSE},
+  {X86::VSUBSDZrrk_Int, X86::VSUBSDZrmk_Int, TB_NO_REVERSE},
+  {X86::VSUBSHZrrk_Int, X86::VSUBSHZrmk_Int, TB_NO_REVERSE},
+  {X86::VSUBSSZrrk_Int, X86::VSUBSSZrmk_Int, TB_NO_REVERSE},
   {X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0},
   {X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0},
   {X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0},
diff --git a/llvm/test/ThinLTO/X86/memprof-recursive.ll b/llvm/test/ThinLTO/X86/memprof-recursive.ll
new file mode 100644
index 0000000..2b1d708
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-recursive.ll
@@ -0,0 +1,141 @@
+;; Test recursion handling during cloning.
+;;
+;; See llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll for
+;; information on how the test was created.
+
+; RUN: opt -thinlto-bc %s >%t.o
+
+;; By default we should enable cloning of contexts involved with recursive
+;; cycles, but not through the cycle itself. I.e. until full support for
+;; recursion is added, the cloned recursive call from C back to B (line 12) will
+;; not be updated to call a clone.
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -r=%t.o,_Z1Dv,plx \
+; RUN:  -r=%t.o,_Z1Ci,plx \
+; RUN:  -r=%t.o,_Z1Bi,plx \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  -o %t.out 2>&1 | FileCheck %s \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS
+
+;; Skipping recursive callsites should result in no cloning.
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -r=%t.o,_Z1Dv,plx \
+; RUN:  -r=%t.o,_Z1Ci,plx \
+; RUN:  -r=%t.o,_Z1Bi,plx \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:	-memprof-allow-recursive-callsites=false \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --allow-empty \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --implicit-check-not="created clone" \
+; RUN:	--implicit-check-not="marked with memprof allocation attribute cold"
+
+;; Skipping recursive contexts should prevent spurious call to cloned version of
+;; B from the context starting at memprof_recursive.cc:19:13, which is actually
+;; recursive (until that support is added).
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -r=%t.o,_Z1Dv,plx \
+; RUN:  -r=%t.o,_Z1Ci,plx \
+; RUN:  -r=%t.o,_Z1Bi,plx \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:	-memprof-allow-recursive-contexts=false \
+; RUN:  -o %t.out 2>&1 | FileCheck %s \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=SKIP-RECUR-CONTEXTS
+
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:4:0: created clone _Z1Dv.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:5:10: call in clone _Z1Dv marked with memprof allocation attribute notcold
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:5:10: call in clone _Z1Dv.memprof.1 marked with memprof allocation attribute cold
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:8:0: created clone _Z1Ci.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:10:12: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Dv.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:14:0: created clone _Z1Bi.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:15:10: call in clone _Z1Bi.memprof.1 assigned to call function clone _Z1Ci.memprof.1
+;; We should only call the cold clone for the recursive context if we enabled
+;; recursive contexts via -memprof-allow-recursive-contexts=true (default).
+; ALLOW-RECUR-CONTEXTS: memprof_recursive.cc:19:13: call in clone main assigned to call function clone _Z1Bi.memprof.1
+; SKIP-RECUR-CONTEXTS-NOT: memprof_recursive.cc:19:13: call in clone main assigned to call function clone _Z1Bi.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:20:13: call in clone main assigned to call function clone _Z1Bi.memprof.1
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define ptr @_Z1Dv() !dbg !3 {
+entry:
+  %call = tail call ptr @_Znam(i64 10), !dbg !6, !memprof !7, !callsite !14
+  ret ptr null
+}
+
+define ptr @_Z1Ci(i32 %n) !dbg !15 {
+entry:
+  %call = tail call ptr @_Z1Dv(), !dbg !16, !callsite !17
+  br label %return
+
+if.end:                                           ; No predecessors!
+  %call1 = tail call ptr @_Z1Bi(i32 0), !dbg !18, !callsite !19
+  br label %return
+
+return:                                           ; preds = %if.end, %entry
+  ret ptr null
+}
+
+define ptr @_Z1Bi(i32 %n) !dbg !20 {
+entry:
+  %call = tail call ptr @_Z1Ci(i32 0), !dbg !21, !callsite !22
+  ret ptr null
+}
+
+define i32 @main() {
+entry:
+  %call = tail call ptr @_Z1Bi(i32 0), !dbg !23, !callsite !25
+  %call1 = tail call ptr @_Z1Bi(i32 0), !dbg !26, !callsite !27
+  %call2 = tail call ptr @_Z1Bi(i32 0), !dbg !28, !callsite !29
+  ret i32 0
+}
+
+declare ptr @_Znam(i64)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 20.0.0git (https://github.com/llvm/llvm-project.git 7aec6dc477f8148ed066d10dfc7a012a51b6599c)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "memprof_recursive.cc", directory: ".", checksumkind: CSK_MD5, checksum: "2f15f63b187a0e0d40e7fdd18b10576a")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "D", linkageName: "_Z1Dv", scope: !1, file: !1, line: 4, type: !4, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!4 = !DISubroutineType(types: !5)
+!5 = !{}
+!6 = !DILocation(line: 5, column: 10, scope: !3)
+!7 = !{!8, !10, !12}
+!8 = !{!9, !"cold"}
+!9 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 6307901912192269588}
+!10 = !{!11, !"notcold"}
+!11 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 -7155190423157709404, i64 -2954124005641725917, i64 8632435727821051414}
+!12 = !{!13, !"cold"}
+!13 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 -7155190423157709404, i64 -2954124005641725917, i64 -3421689549917153178}
+!14 = !{i64 6541423618768552252}
+!15 = distinct !DISubprogram(name: "C", linkageName: "_Z1Ci", scope: !1, file: !1, line: 8, type: !4, scopeLine: 8, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!16 = !DILocation(line: 10, column: 12, scope: !15)
+!17 = !{i64 -200552803509692312}
+!18 = !DILocation(line: 12, column: 10, scope: !15)
+!19 = !{i64 -7155190423157709404}
+!20 = distinct !DISubprogram(name: "B", linkageName: "_Z1Bi", scope: !1, file: !1, line: 14, type: !4, scopeLine: 14, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!21 = !DILocation(line: 15, column: 10, scope: !20)
+!22 = !{i64 -2954124005641725917}
+!23 = !DILocation(line: 18, column: 13, scope: !24)
+!24 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 17, type: !4, scopeLine: 17, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!25 = !{i64 8632435727821051414}
+!26 = !DILocation(line: 19, column: 13, scope: !24)
+!27 = !{i64 -3421689549917153178}
+!28 = !DILocation(line: 20, column: 13, scope: !24)
+!29 = !{i64 6307901912192269588}
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll
new file mode 100644
index 0000000..52257c1
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -codegenprepare -S < %s -mtriple=aarch64-none-linux-gnu -mattr=+sve | FileCheck %s
+
+define i64 @select_or_reduce_v2i1(ptr nocapture noundef readonly %src) {
+; CHECK-LABEL: define i64 @select_or_reduce_v2i1(
+; CHECK-SAME: ptr nocapture noundef readonly [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[OR_REDUC:%.*]] = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[COND]])
+; CHECK-NEXT:    [[IV_CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[OR_REDUC]], i64 1, i64 0
+; CHECK-NEXT:    ret i64 [[SEL]]
+;
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, 2
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  %sel = select i1 %or.reduc, i64 1, i64 0
+  ret i64 %sel
+}
+
+define i64 @br_or_reduce_v2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
+; CHECK-LABEL: define i64 @br_or_reduce_v2i1(
+; CHECK-SAME: ptr nocapture noundef readonly [[SRC:%.*]], ptr noundef readnone [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[OR_REDUC:%.*]] = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[COND]])
+; CHECK-NEXT:    [[IV_CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    br i1 [[OR_REDUC]], label %[[FOUND:.*]], label %[[NOTFOUND:.*]]
+; CHECK:       [[FOUND]]:
+; CHECK-NEXT:    store i64 56, ptr [[P]], align 8
+; CHECK-NEXT:    ret i64 1
+; CHECK:       [[NOTFOUND]]:
+; CHECK-NEXT:    ret i64 0
+;
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, 2
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  br i1 %or.reduc, label %found, label %notfound
+
+found:
+  store i64 56, ptr %p, align 8
+  ret i64 1
+
+notfound:
+  ret i64 0
+}
+
+define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) {
+; CHECK-LABEL: define i64 @select_or_reduce_nxv2i1(
+; CHECK-SAME: ptr nocapture noundef readonly [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT:    [[OR_REDUC:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[COND]])
+; CHECK-NEXT:    [[IV_CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP2]])
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[TMP3]], i64 1, i64 0
+; CHECK-NEXT:    ret i64 [[SEL]]
+;
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vf = shl nuw nsw i64 %vscale, 1
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, %vf
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  %sel = select i1 %or.reduc, i64 1, i64 0
+  ret i64 %sel
+}
+
+define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
+; CHECK-LABEL: define i64 @br_or_reduce_nxv2i1(
+; CHECK-SAME: ptr nocapture noundef readonly [[SRC:%.*]], ptr noundef readnone [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT:    [[OR_REDUC:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[COND]])
+; CHECK-NEXT:    [[IV_CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP2]])
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[FOUND:.*]], label %[[NOTFOUND:.*]]
+; CHECK:       [[FOUND]]:
+; CHECK-NEXT:    store i64 56, ptr [[P]], align 8
+; CHECK-NEXT:    ret i64 1
+; CHECK:       [[NOTFOUND]]:
+; CHECK-NEXT:    ret i64 0
+;
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vf = shl nuw nsw i64 %vscale, 1
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, %vf
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  br i1 %or.reduc, label %found, label %notfound
+
+found:
+  store i64 56, ptr %p, align 8
+  ret i64 1
+
+notfound:
+  ret i64 0
+}
+
+declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>)
+declare i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1>)
diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
index 366446a..93872c3 100644
--- a/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
+++ b/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
@@ -381,26 +381,58 @@ define void @pr65222(i32 %flags, i1 %cmp, i1 %tobool.not) {
 ; CHECK:       then:
 ; CHECK-NEXT:    br i1 [[TOBOOL_NOT:%.*]], label [[COND1_SI_UNFOLD_TRUE:%.*]], label [[COND_SI_UNFOLD_TRUE:%.*]]
 ; CHECK:       cond.si.unfold.true:
+; CHECK-NEXT:    br i1 [[CMP]], label [[TOUNFOLD_SI_UNFOLD_FALSE1:%.*]], label [[COND_SI_UNFOLD_FALSE_JT0:%.*]]
+; CHECK:       cond.si.unfold.true.jt2:
 ; CHECK-NEXT:    [[DOTSI_UNFOLD_PHI:%.*]] = phi i32 [ 2, [[THEN]] ]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[TOUNFOLD_SI_UNFOLD_FALSE:%.*]], label [[COND_SI_UNFOLD_FALSE:%.*]]
 ; CHECK:       cond.si.unfold.false:
 ; CHECK-NEXT:    [[DOTSI_UNFOLD_PHI1:%.*]] = phi i32 [ 0, [[COND_SI_UNFOLD_TRUE]] ]
-; CHECK-NEXT:    br label [[TOUNFOLD_SI_UNFOLD_FALSE]]
+; CHECK-NEXT:    br label [[TOUNFOLD_SI_UNFOLD_FALSE1]]
+; CHECK:       cond.si.unfold.false.jt0:
+; CHECK-NEXT:    [[DOTSI_UNFOLD_PHI1_JT0:%.*]] = phi i32 [ 0, [[COND_SI_UNFOLD_TRUE1:%.*]] ]
+; CHECK-NEXT:    br label [[TOUNFOLD_SI_UNFOLD_FALSE_JT0:%.*]]
 ; CHECK:       tounfold.si.unfold.false:
-; CHECK-NEXT:    [[COND_SI_UNFOLD_PHI:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI]], [[COND_SI_UNFOLD_TRUE]] ], [ [[DOTSI_UNFOLD_PHI1]], [[COND_SI_UNFOLD_FALSE]] ]
+; CHECK-NEXT:    [[COND_SI_UNFOLD_PHI:%.*]] = phi i32 [ poison, [[COND_SI_UNFOLD_TRUE1]] ], [ [[DOTSI_UNFOLD_PHI1]], [[COND_SI_UNFOLD_FALSE]] ]
 ; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       tounfold.si.unfold.false.jt0:
+; CHECK-NEXT:    [[COND_SI_UNFOLD_PHI_JT0:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI1_JT0]], [[COND_SI_UNFOLD_FALSE_JT0]] ]
+; CHECK-NEXT:    br label [[IF_END_JT0:%.*]]
+; CHECK:       tounfold.si.unfold.false.jt2:
+; CHECK-NEXT:    [[COND_SI_UNFOLD_PHI_JT2:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI]], [[COND_SI_UNFOLD_TRUE]] ]
+; CHECK-NEXT:    br label [[IF_END_JT2:%.*]]
 ; CHECK:       cond1.si.unfold.true:
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_END]], label [[COND1_SI_UNFOLD_FALSE_JT1:%.*]]
+; CHECK:       cond1.si.unfold.true.jt3:
 ; CHECK-NEXT:    [[DOTSI_UNFOLD_PHI2:%.*]] = phi i32 [ 3, [[THEN]] ]
-; CHECK-NEXT:    br i1 [[CMP]], label [[IF_END]], label [[COND1_SI_UNFOLD_FALSE:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_END_JT3:%.*]], label [[COND1_SI_UNFOLD_FALSE:%.*]]
 ; CHECK:       cond1.si.unfold.false:
 ; CHECK-NEXT:    [[DOTSI_UNFOLD_PHI3:%.*]] = phi i32 [ 1, [[COND1_SI_UNFOLD_TRUE]] ]
 ; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       cond1.si.unfold.false.jt1:
+; CHECK-NEXT:    [[DOTSI_UNFOLD_PHI3_JT1:%.*]] = phi i32 [ 1, [[COND1_SI_UNFOLD_TRUE1:%.*]] ]
+; CHECK-NEXT:    br label [[IF_END_JT1:%.*]]
 ; CHECK:       if.end:
-; CHECK-NEXT:    [[UNFOLDED:%.*]] = phi i32 [ [[FLAGS:%.*]], [[WHILE_COND]] ], [ [[COND_SI_UNFOLD_PHI]], [[TOUNFOLD_SI_UNFOLD_FALSE]] ], [ [[DOTSI_UNFOLD_PHI2]], [[COND1_SI_UNFOLD_TRUE]] ], [ [[DOTSI_UNFOLD_PHI3]], [[COND1_SI_UNFOLD_FALSE]] ]
-; CHECK-NEXT:    [[OTHER:%.*]] = phi i32 [ [[FLAGS]], [[WHILE_COND]] ], [ 0, [[TOUNFOLD_SI_UNFOLD_FALSE]] ], [ 0, [[COND1_SI_UNFOLD_TRUE]] ], [ 0, [[COND1_SI_UNFOLD_FALSE]] ]
+; CHECK-NEXT:    [[UNFOLDED:%.*]] = phi i32 [ [[FLAGS:%.*]], [[WHILE_COND]] ], [ [[COND_SI_UNFOLD_PHI]], [[TOUNFOLD_SI_UNFOLD_FALSE1]] ], [ poison, [[COND1_SI_UNFOLD_TRUE1]] ], [ [[DOTSI_UNFOLD_PHI3]], [[COND1_SI_UNFOLD_FALSE]] ]
+; CHECK-NEXT:    [[OTHER:%.*]] = phi i32 [ [[FLAGS]], [[WHILE_COND]] ], [ 0, [[TOUNFOLD_SI_UNFOLD_FALSE1]] ], [ 0, [[COND1_SI_UNFOLD_TRUE1]] ], [ 0, [[COND1_SI_UNFOLD_FALSE]] ]
 ; CHECK-NEXT:    switch i32 [[UNFOLDED]], label [[UNREACHABLE:%.*]] [
 ; CHECK-NEXT:      i32 0, label [[SW_BB:%.*]]
 ; CHECK-NEXT:    ]
+; CHECK:       if.end.jt1:
+; CHECK-NEXT:    [[UNFOLDED_JT1:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI3_JT1]], [[COND1_SI_UNFOLD_FALSE_JT1]] ]
+; CHECK-NEXT:    [[OTHER_JT1:%.*]] = phi i32 [ 0, [[COND1_SI_UNFOLD_FALSE_JT1]] ]
+; CHECK-NEXT:    br label [[UNREACHABLE]]
+; CHECK:       if.end.jt3:
+; CHECK-NEXT:    [[UNFOLDED_JT3:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI2]], [[COND1_SI_UNFOLD_TRUE]] ]
+; CHECK-NEXT:    [[OTHER_JT3:%.*]] = phi i32 [ 0, [[COND1_SI_UNFOLD_TRUE]] ]
+; CHECK-NEXT:    br label [[UNREACHABLE]]
+; CHECK:       if.end.jt0:
+; CHECK-NEXT:    [[UNFOLDED_JT0:%.*]] = phi i32 [ [[COND_SI_UNFOLD_PHI_JT0]], [[TOUNFOLD_SI_UNFOLD_FALSE_JT0]] ]
+; CHECK-NEXT:    [[OTHER_JT0:%.*]] = phi i32 [ 0, [[TOUNFOLD_SI_UNFOLD_FALSE_JT0]] ]
+; CHECK-NEXT:    br label [[SW_BB]]
+; CHECK:       if.end.jt2:
+; CHECK-NEXT:    [[UNFOLDED_JT2:%.*]] = phi i32 [ [[COND_SI_UNFOLD_PHI_JT2]], [[TOUNFOLD_SI_UNFOLD_FALSE]] ]
+; CHECK-NEXT:    [[OTHER_JT2:%.*]] = phi i32 [ 0, [[TOUNFOLD_SI_UNFOLD_FALSE]] ]
+; CHECK-NEXT:    br label [[UNREACHABLE]]
 ; CHECK:       unreachable:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       sw.bb:
diff --git a/llvm/test/Transforms/DFAJumpThreading/negative.ll b/llvm/test/Transforms/DFAJumpThreading/negative.ll
index a964281..3eab1e1 100644
--- a/llvm/test/Transforms/DFAJumpThreading/negative.ll
+++ b/llvm/test/Transforms/DFAJumpThreading/negative.ll
@@ -218,9 +218,45 @@ for.end:
 declare i32 @arbitrary_function()
 
 ; Don't confuse %state.2 for the initial switch value.
+; [ 3, %case2 ] can still be threaded.
 define i32 @negative6(i32 %init) {
-; REMARK: SwitchNotPredictable
-; REMARK-NEXT: negative6
+; CHECK-LABEL: define i32 @negative6(
+; CHECK-SAME: i32 [[INIT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INIT]], 0
+; CHECK-NEXT:    br label %[[LOOP_2:.*]]
+; CHECK:       [[LOOP_2]]:
+; CHECK-NEXT:    [[STATE_2:%.*]] = call i32 @arbitrary_function()
+; CHECK-NEXT:    br label %[[LOOP_3:.*]]
+; CHECK:       [[LOOP_3]]:
+; CHECK-NEXT:    [[STATE:%.*]] = phi i32 [ [[STATE_2]], %[[LOOP_2]] ]
+; CHECK-NEXT:    switch i32 [[STATE]], label %[[INFLOOP_I:.*]] [
+; CHECK-NEXT:      i32 2, label %[[CASE2:.*]]
+; CHECK-NEXT:      i32 3, label %[[CASE3:.*]]
+; CHECK-NEXT:      i32 4, label %[[CASE4:.*]]
+; CHECK-NEXT:      i32 0, label %[[CASE0:.*]]
+; CHECK-NEXT:      i32 1, label %[[CASE1:.*]]
+; CHECK-NEXT:    ]
+; CHECK:       [[LOOP_3_JT3:.*]]:
+; CHECK-NEXT:    [[STATE_JT3:%.*]] = phi i32 [ 3, %[[CASE2]] ]
+; CHECK-NEXT:    br label %[[CASE3]]
+; CHECK:       [[CASE2]]:
+; CHECK-NEXT:    br label %[[LOOP_3_JT3]]
+; CHECK:       [[CASE3]]:
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP_2_BACKEDGE:.*]], label %[[CASE4]]
+; CHECK:       [[CASE4]]:
+; CHECK-NEXT:    br label %[[LOOP_2_BACKEDGE]]
+; CHECK:       [[LOOP_2_BACKEDGE]]:
+; CHECK-NEXT:    br label %[[LOOP_2]]
+; CHECK:       [[CASE0]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[CASE1]]:
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[INFLOOP_I]]:
+; CHECK-NEXT:    br label %[[INFLOOP_I]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret i32 0
+;
 entry:
   %cmp = icmp eq i32 %init, 0
   br label %loop.2
diff --git a/llvm/test/Transforms/EntryExitInstrumenter/mcount-with-frompc.ll b/llvm/test/Transforms/EntryExitInstrumenter/mcount-with-frompc.ll
new file mode 100644
index 0000000..0f8cf5c7
--- /dev/null
+++ b/llvm/test/Transforms/EntryExitInstrumenter/mcount-with-frompc.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=riscv64 -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,RISCV64
+; RUN: opt -mtriple=riscv32 -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,RISCV32
+; RUN: opt -mtriple=loongarch64 -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,LOONGARCH64
+; RUN: opt -mtriple=loongarch32 -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,LOONGARCH32
+; RUN: opt -mtriple=aarch64 -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,AARCH64
+; RUN: opt -mtriple=aarch64_be -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,AARCH64_BE
+; RUN: opt -mtriple=aarch64_32 -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,AARCH64_32
+
+define void @f1() "instrument-function-entry-inlined"="_mcount" {
+; CHECK-LABEL: define void @f1() {
+; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT:    call void @_mcount(ptr [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AARCH64: {{.*}}
+; AARCH64_32: {{.*}}
+; AARCH64_BE: {{.*}}
+; LOONGARCH32: {{.*}}
+; LOONGARCH64: {{.*}}
+; RISCV32: {{.*}}
+; RISCV64: {{.*}}
diff --git a/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll b/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll
index bd5f4c2..56ccfb9 100644
--- a/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll
+++ b/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll
@@ -129,6 +129,13 @@ define void @naked() naked {
   ret void
 }
 
+define available_externally void @always_inline() {
+; CHECK-LABEL: define available_externally void @always_inline() {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
 ; The attributes are "consumed" when the instrumentation is inserted.
 ; CHECK: attributes
 ; CHECK-NOT: instrument-function
diff --git a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
index 9243969..179b5b0 100644
--- a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
@@ -45,11 +45,7 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext i24 [[TMP2]] to i32
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[TMP5]], i32 [[TMP6]])
 ; CHECK-NEXT:    ret i32 [[TMP11]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 3)
@@ -63,11 +59,7 @@ define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
-; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
-; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[TMP3]], i32 [[TMP4]])
 ; CHECK-NEXT:    ret i32 [[TMP9]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 4)
@@ -83,11 +75,7 @@ define i32 @cmp5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext i40 [[TMP2]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i64(i64 [[TMP5]], i64 [[TMP6]])
 ; CHECK-NEXT:    ret i32 [[TMP11]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 5)
@@ -103,11 +91,7 @@ define i32 @cmp6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext i48 [[TMP2]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i64(i64 [[TMP5]], i64 [[TMP6]])
 ; CHECK-NEXT:    ret i32 [[TMP11]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 6)
@@ -155,11 +139,7 @@ define i32 @cmp8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
-; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
-; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.ucmp.i32.i64(i64 [[TMP3]], i64 [[TMP4]])
 ; CHECK-NEXT:    ret i32 [[TMP9]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 8)
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
index d71ae8b..0507ec9 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
@@ -71,11 +71,7 @@ define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
 ; X32-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
 ; X32-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
-; X32-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
-; X32-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
-; X32-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; X32-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; X32-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; X32-NEXT:    [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[TMP5]], i32 [[TMP6]])
 ; X32-NEXT:    ret i32 [[TMP11]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 4)
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
index f686e29..86dc3e5 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
@@ -73,11 +73,7 @@ define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
 ; X64-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
 ; X64-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
-; X64-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; X64-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; X64-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; X64-NEXT:    [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[TMP5]], i32 [[TMP6]])
 ; X64-NEXT:    ret i32 [[TMP11]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 4)
@@ -189,11 +185,7 @@ define i32 @cmp8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
 ; X64-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
 ; X64-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
-; X64-NEXT:    [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; X64-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; X64-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; X64-NEXT:    [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i64(i64 [[TMP5]], i64 [[TMP6]])
 ; X64-NEXT:    ret i32 [[TMP11]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 8)
diff --git a/llvm/test/Transforms/GCOVProfiling/exit-block.ll b/llvm/test/Transforms/GCOVProfiling/exit-block.ll
index 50c4dc4..1840f04 100644
--- a/llvm/test/Transforms/GCOVProfiling/exit-block.ll
+++ b/llvm/test/Transforms/GCOVProfiling/exit-block.ll
@@ -3,13 +3,9 @@
 ; RUN: echo '!19 = !{!"%/t/exit-block.ll", !0}' > %t/1
 ; RUN: cat %s %t/1 > %t/2
 
-; By default, the exit block is the second.
+; The exit block is the second.
 ; RUN: opt -passes=insert-gcov-profiling -disable-output %t/2
-; RUN: llvm-cov gcov -n -dump %t/exit-block.gcno 2>&1 | FileCheck --check-prefixes=CHECK,EXIT-SECOND %s
-
-; But we can optionally emit it last, to match GCC<4.8 (r189778).
-; RUN: opt -passes=insert-gcov-profiling -default-gcov-version='407*' -disable-output %t/2
-; RUN: llvm-cov gcov -n -dump %t/exit-block.gcno 2>&1 | FileCheck --check-prefixes=CHECK,EXIT-LAST %s
+; RUN: llvm-cov gcov -n -dump %t/exit-block.gcno 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -66,10 +62,7 @@ attributes #2 = { nounwind }
 
 ; There should be no destination edges for the exit block.
 ; CHECK: Block : 1 Counter : 0
-; EXIT-LAST:       Destination Edges
-; EXIT-SECOND-NOT: Destination Edges
 ; CHECK: Block : 2 Counter : 0
 ; CHECK: Block : 4 Counter : 0
-; EXIT-LAST-NOT: Destination Edges
-; EXIT-SECOND:   Destination Edges
+; CHECK:         Destination Edges
 ; CHECK-NOT: Block :
diff --git a/llvm/test/Transforms/GCOVProfiling/version.ll b/llvm/test/Transforms/GCOVProfiling/version.ll
index bfac255..4751bc1 100644
--- a/llvm/test/Transforms/GCOVProfiling/version.ll
+++ b/llvm/test/Transforms/GCOVProfiling/version.ll
@@ -5,16 +5,16 @@
 ; RUN: cat %t/little.txt %s %t/version.txt > %t/2
 
 ; RUN: opt -passes=insert-gcov-profiling -disable-output < %t/2
-; RUN: head -c8 %t/version.gcno | grep '^oncg.804'
+; RUN: head -c8 %t/version.gcno | grep '^oncg.11B'
 ; RUN: rm %t/version.gcno
 ; RUN: not opt -passes=insert-gcov-profiling -default-gcov-version=asdfasdf -disable-output < %t/2
-; RUN: opt -passes=insert-gcov-profiling -default-gcov-version='402*' -disable-output < %t/2
-; RUN: head -c8 %t/version.gcno | grep '^oncg.204'
+; RUN: opt -passes=insert-gcov-profiling -default-gcov-version='B21*' -disable-output < %t/2
+; RUN: head -c8 %t/version.gcno | grep '^oncg.12B'
 ; RUN: rm %t/version.gcno
 
 ; RUN: cat %t/big.txt %s %t/version.txt > %t/big.ll
 ; RUN: opt -passes=insert-gcov-profiling -disable-output < %t/big.ll
-; RUN: head -c8 %t/version.gcno | grep '^gcno408.'
+; RUN: head -c8 %t/version.gcno | grep '^gcnoB11.'
 
 define void @test() !dbg !5 {
   ret void, !dbg !8
diff --git a/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll b/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll
index 7ffbd64..4cd1491 100644
--- a/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll
+++ b/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll
@@ -391,9 +391,33 @@ define void @nonzt0_callee() {
   ret void
 }
 
+define void @new_zt0_callee() "aarch64_new_zt0" {
+; CHECK-LABEL: define void @new_zt0_callee
+; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void @inlined_body()
+; CHECK-NEXT:    ret void
+;
+  call void asm sideeffect "; inlineasm", ""()
+  call void @inlined_body()
+  ret void
+}
+
+define void @nonzt0_caller_new_zt0_callee_dont_inline() {
+; CHECK-LABEL: define void @nonzt0_caller_new_zt0_callee_dont_inline
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @new_zt0_callee()
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @new_zt0_callee()
+  ret void
+}
+
 define void @shared_zt0_caller_nonzt0_callee_dont_inline() "aarch64_inout_zt0" {
 ; CHECK-LABEL: define void @shared_zt0_caller_nonzt0_callee_dont_inline
-; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR5:[0-9]+]] {
 ; CHECK-NEXT:    call void @nonzt0_callee()
 ; CHECK-NEXT:    ret void
 ;
@@ -403,7 +427,7 @@ define void @shared_zt0_caller_nonzt0_callee_dont_inline() "aarch64_inout_zt0" {
 
 define void @shared_zt0_callee() "aarch64_inout_zt0" {
 ; CHECK-LABEL: define void @shared_zt0_callee
-; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-SAME: () #[[ATTR5]] {
 ; CHECK-NEXT:    call void asm sideeffect "
 ; CHECK-NEXT:    call void @inlined_body()
 ; CHECK-NEXT:    ret void
@@ -415,7 +439,7 @@ define void @shared_zt0_callee() "aarch64_inout_zt0" {
 
 define void @shared_zt0_caller_shared_zt0_callee_inline() "aarch64_inout_zt0" {
 ; CHECK-LABEL: define void @shared_zt0_caller_shared_zt0_callee_inline
-; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-SAME: () #[[ATTR5]] {
 ; CHECK-NEXT:    call void asm sideeffect "
 ; CHECK-NEXT:    call void @inlined_body()
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/InstCombine/add-shl-sdiv-to-srem.ll b/llvm/test/Transforms/InstCombine/add-shl-sdiv-to-srem.ll
index 84462f9..d4edf12e 100644
--- a/llvm/test/Transforms/InstCombine/add-shl-sdiv-to-srem.ll
+++ b/llvm/test/Transforms/InstCombine/add-shl-sdiv-to-srem.ll
@@ -12,6 +12,17 @@ define i8 @add-shl-sdiv-scalar0(i8 %x) {
   ret i8 %rz
 }
 
+define i8 @add-shl-sdiv-scalar0_commuted(i8 %x) {
+; CHECK-LABEL: @add-shl-sdiv-scalar0_commuted(
+; CHECK-NEXT:    [[RZ:%.*]] = srem i8 [[X:%.*]], 4
+; CHECK-NEXT:    ret i8 [[RZ]]
+;
+  %sd = sdiv i8 %x, -4
+  %sl = shl i8 %sd, 2
+  %rz = add i8 %x, %sl
+  ret i8 %rz
+}
+
 define i8 @add-shl-sdiv-scalar1(i8 %x) {
 ; CHECK-LABEL: @add-shl-sdiv-scalar1(
 ; CHECK-NEXT:    [[RZ:%.*]] = srem i8 [[X:%.*]], 64
diff --git a/llvm/test/Transforms/InstCombine/and-or-icmps.ll b/llvm/test/Transforms/InstCombine/and-or-icmps.ll
index fffe1f8..9651858 100644
--- a/llvm/test/Transforms/InstCombine/and-or-icmps.ll
+++ b/llvm/test/Transforms/InstCombine/and-or-icmps.ll
@@ -1445,8 +1445,7 @@ define i1 @bitwise_and_logical_and_icmps_comm2(i8 %x, i8 %y, i8 %z) {
 ; CHECK-LABEL: @bitwise_and_logical_and_icmps_comm2(
 ; CHECK-NEXT:    [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42
 ; CHECK-NEXT:    [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = freeze i8 [[Z_SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = or i8 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = or i8 [[Z_SHIFT]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[X:%.*]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i8 [[TMP3]], [[TMP2]]
 ; CHECK-NEXT:    [[AND2:%.*]] = select i1 [[TMP4]], i1 [[C1]], i1 false
@@ -1796,8 +1795,7 @@ define i1 @bitwise_or_logical_or_icmps_comm2(i8 %x, i8 %y, i8 %z) {
 ; CHECK-LABEL: @bitwise_or_logical_or_icmps_comm2(
 ; CHECK-NEXT:    [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42
 ; CHECK-NEXT:    [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = freeze i8 [[Z_SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = or i8 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = or i8 [[Z_SHIFT]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[X:%.*]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i8 [[TMP3]], [[TMP2]]
 ; CHECK-NEXT:    [[OR2:%.*]] = select i1 [[TMP4]], i1 true, i1 [[C1]]
@@ -2068,12 +2066,10 @@ define i1 @bitwise_and_logical_and_masked_icmp_allzeros_poison1(i1 %c, i32 %x, i
 
 define i1 @bitwise_and_logical_and_masked_icmp_allzeros_poison2(i1 %c, i32 %x, i32 %y) {
 ; CHECK-LABEL: @bitwise_and_logical_and_masked_icmp_allzeros_poison2(
-; CHECK-NEXT:    [[X_M1:%.*]] = and i32 [[X:%.*]], 8
-; CHECK-NEXT:    [[C1:%.*]] = icmp eq i32 [[X_M1]], 0
-; CHECK-NEXT:    [[AND1:%.*]] = select i1 [[C1]], i1 [[C:%.*]], i1 false
-; CHECK-NEXT:    [[X_M2:%.*]] = and i32 [[X]], [[Y:%.*]]
+; CHECK-NEXT:    [[Y:%.*]] = or i32 [[Y1:%.*]], 8
+; CHECK-NEXT:    [[X_M2:%.*]] = and i32 [[X:%.*]], [[Y]]
 ; CHECK-NEXT:    [[C2:%.*]] = icmp eq i32 [[X_M2]], 0
-; CHECK-NEXT:    [[AND2:%.*]] = and i1 [[AND1]], [[C2]]
+; CHECK-NEXT:    [[AND2:%.*]] = select i1 [[C2]], i1 [[C:%.*]], i1 false
 ; CHECK-NEXT:    ret i1 [[AND2]]
 ;
   %x.m1 = and i32 %x, 8
@@ -2120,12 +2116,10 @@ define i1 @bitwise_and_logical_and_masked_icmp_allones_poison1(i1 %c, i32 %x, i3
 
 define i1 @bitwise_and_logical_and_masked_icmp_allones_poison2(i1 %c, i32 %x, i32 %y) {
 ; CHECK-LABEL: @bitwise_and_logical_and_masked_icmp_allones_poison2(
-; CHECK-NEXT:    [[X_M1:%.*]] = and i32 [[X:%.*]], 8
-; CHECK-NEXT:    [[C1:%.*]] = icmp ne i32 [[X_M1]], 0
-; CHECK-NEXT:    [[AND1:%.*]] = select i1 [[C1]], i1 [[C:%.*]], i1 false
-; CHECK-NEXT:    [[X_M2:%.*]] = and i32 [[X]], [[Y:%.*]]
+; CHECK-NEXT:    [[Y:%.*]] = or i32 [[Y1:%.*]], 8
+; CHECK-NEXT:    [[X_M2:%.*]] = and i32 [[X:%.*]], [[Y]]
 ; CHECK-NEXT:    [[C2:%.*]] = icmp eq i32 [[X_M2]], [[Y]]
-; CHECK-NEXT:    [[AND2:%.*]] = and i1 [[AND1]], [[C2]]
+; CHECK-NEXT:    [[AND2:%.*]] = select i1 [[C2]], i1 [[C:%.*]], i1 false
 ; CHECK-NEXT:    ret i1 [[AND2]]
 ;
   %x.m1 = and i32 %x, 8
diff --git a/llvm/test/Transforms/InstCombine/and-xor-or.ll b/llvm/test/Transforms/InstCombine/and-xor-or.ll
index 5a0890e..5a58995 100644
--- a/llvm/test/Transforms/InstCombine/and-xor-or.ll
+++ b/llvm/test/Transforms/InstCombine/and-xor-or.ll
@@ -388,10 +388,9 @@ define i8 @xor_shl(i8 %x, i8 %y, i8 %zarg, i8 %shamt) {
 ; CHECK-LABEL: define {{[^@]+}}@xor_shl
 ; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[ZARG:%.*]], i8 [[SHAMT:%.*]]) {
 ; CHECK-NEXT:    [[Z:%.*]] = sdiv i8 42, [[ZARG]]
-; CHECK-NEXT:    [[SX:%.*]] = shl i8 [[X]], [[SHAMT]]
-; CHECK-NEXT:    [[SY:%.*]] = shl i8 [[Y]], [[SHAMT]]
-; CHECK-NEXT:    [[A:%.*]] = xor i8 [[Z]], [[SX]]
-; CHECK-NEXT:    [[R:%.*]] = xor i8 [[A]], [[SY]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i8 [[TMP1]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = xor i8 [[TMP2]], [[Z]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %z = sdiv i8 42, %zarg ; thwart complexity-based canonicalization
@@ -406,10 +405,9 @@ define i8 @and_lshr(i8 %x, i8 %y, i8 %zarg, i8 %shamt) {
 ; CHECK-LABEL: define {{[^@]+}}@and_lshr
 ; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[ZARG:%.*]], i8 [[SHAMT:%.*]]) {
 ; CHECK-NEXT:    [[Z:%.*]] = sdiv i8 42, [[ZARG]]
-; CHECK-NEXT:    [[SX:%.*]] = lshr i8 [[X]], [[SHAMT]]
-; CHECK-NEXT:    [[SY:%.*]] = lshr i8 [[Y]], [[SHAMT]]
-; CHECK-NEXT:    [[A:%.*]] = and i8 [[Z]], [[SX]]
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[SY]], [[A]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP1]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[TMP2]], [[Z]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %z = sdiv i8 42, %zarg ; thwart complexity-based canonicalization
@@ -435,6 +433,51 @@ define i8 @or_lshr(i8 %x, i8 %y, i8 %z, i8 %shamt) {
   ret i8 %r
 }
 
+define i8 @or_lshr_commuted1(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; CHECK-LABEL: define {{[^@]+}}@or_lshr_commuted1
+; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]], i8 [[SHAMT:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = or i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP1]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[TMP2]], [[Z]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = lshr i8 %x, %shamt
+  %sy = lshr i8 %y, %shamt
+  %a = or i8 %z, %sx
+  %r = or i8 %sy, %a
+  ret i8 %r
+}
+
+define i8 @or_lshr_commuted2(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; CHECK-LABEL: define {{[^@]+}}@or_lshr_commuted2
+; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]], i8 [[SHAMT:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = or i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP1]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[TMP2]], [[Z]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = lshr i8 %x, %shamt
+  %sy = lshr i8 %y, %shamt
+  %a = or i8 %z, %sx
+  %r = or i8 %a, %sy
+  ret i8 %r
+}
+
+define i8 @or_lshr_commuted3(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; CHECK-LABEL: define {{[^@]+}}@or_lshr_commuted3
+; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]], i8 [[SHAMT:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = or i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP1]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[TMP2]], [[Z]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = lshr i8 %x, %shamt
+  %sy = lshr i8 %y, %shamt
+  %a = or i8 %sx, %z
+  %r = or i8 %a, %sy
+  ret i8 %r
+}
+
 define i8 @xor_lshr(i8 %x, i8 %y, i8 %z, i8 %shamt) {
 ; CHECK-LABEL: define {{[^@]+}}@xor_lshr
 ; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]], i8 [[SHAMT:%.*]]) {
diff --git a/llvm/test/Transforms/InstCombine/bit-checks.ll b/llvm/test/Transforms/InstCombine/bit-checks.ll
index 43cd6dd..936c02c 100644
--- a/llvm/test/Transforms/InstCombine/bit-checks.ll
+++ b/llvm/test/Transforms/InstCombine/bit-checks.ll
@@ -1335,6 +1335,22 @@ define i1 @no_masks_with_logical_or(i32 %a, i32 %b, i32 noundef %c) {
   ret i1 %or2
 }
 
+define i1 @no_masks_with_logical_or_commuted(i32 %a, i32 %b, i32 noundef %c) {
+; CHECK-LABEL: @no_masks_with_logical_or_commuted(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[B:%.*]], 63
+; CHECK-NEXT:    [[C:%.*]] = or i32 [[C1:%.*]], [[A:%.*]]
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ne i32 [[C]], 0
+; CHECK-NEXT:    [[OR2:%.*]] = select i1 [[CMP3]], i1 true, i1 [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR2]]
+;
+  %cmp1 = icmp ne i32 %a, 0
+  %cmp2 = icmp ne i32 %b, 63
+  %or1 = select i1 %cmp1, i1 true, i1 %cmp2
+  %cmp3 = icmp ne i32 %c, 0
+  %or2 = or i1 %cmp3, %or1
+  ret i1 %or2
+}
+
 define i1 @no_masks_with_logical_or2(i32 %a, i32 %b, i32 noundef %c) {
 ; CHECK-LABEL: @no_masks_with_logical_or2(
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[B:%.*]], 63
diff --git a/llvm/test/Transforms/InstCombine/compare-signs.ll b/llvm/test/Transforms/InstCombine/compare-signs.ll
index 9703b47..59ec9ad 100644
--- a/llvm/test/Transforms/InstCombine/compare-signs.ll
+++ b/llvm/test/Transforms/InstCombine/compare-signs.ll
@@ -152,6 +152,19 @@ define i1 @test4a(i32 %a) {
   ret i1 %c
 }
 
+define i1 @test4a_commuted(i32 %a) {
+; CHECK-LABEL: @test4a_commuted(
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[SIGNUM:%.*]], 1
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %l = ashr i32 %a, 31
+  %na = sub i32 0, %a
+  %r = lshr i32 %na, 31
+  %signum = or i32 %r, %l
+  %c = icmp slt i32 %signum, 1
+  ret i1 %c
+}
+
 define <2 x i1> @test4a_vec(<2 x i32> %a) {
 ; CHECK-LABEL: @test4a_vec(
 ; CHECK-NEXT:    [[C:%.*]] = icmp slt <2 x i32> [[A:%.*]], splat (i32 1)
diff --git a/llvm/test/Transforms/InstCombine/copysign.ll b/llvm/test/Transforms/InstCombine/copysign.ll
index abc707a..ee093a7 100644
--- a/llvm/test/Transforms/InstCombine/copysign.ll
+++ b/llvm/test/Transforms/InstCombine/copysign.ll
@@ -82,7 +82,7 @@ define float @not_known_positive_sign_arg(float %x, float %y) {
 
 define float @copysign_sign_arg(float %x, float %y, float %z) {
 ; CHECK-LABEL: @copysign_sign_arg(
-; CHECK-NEXT:    [[R:%.*]] = call ninf float @llvm.copysign.f32(float [[X:%.*]], float [[Z:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = call float @llvm.copysign.f32(float [[X:%.*]], float [[Z:%.*]])
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %s = call reassoc float @llvm.copysign.f32(float %y, float %z)
@@ -90,6 +90,26 @@ define float @copysign_sign_arg(float %x, float %y, float %z) {
   ret float %r
 }
 
+define float @copysign_sign_arg_nnan(float %x, float %y, float %z) {
+; CHECK-LABEL: @copysign_sign_arg_nnan(
+; CHECK-NEXT:    [[R:%.*]] = call nnan float @llvm.copysign.f32(float [[X:%.*]], float [[Z:%.*]])
+; CHECK-NEXT:    ret float [[R]]
+;
+  %s = call nnan float @llvm.copysign.f32(float %y, float %z)
+  %r = call nnan float @llvm.copysign.f32(float %x, float %s)
+  ret float %r
+}
+
+define float @copysign_sign_arg_mixed(float %x, float %y, float %z) {
+; CHECK-LABEL: @copysign_sign_arg_mixed(
+; CHECK-NEXT:    [[R:%.*]] = call nsz float @llvm.copysign.f32(float [[X:%.*]], float [[Z:%.*]])
+; CHECK-NEXT:    ret float [[R]]
+;
+  %s = call ninf nsz float @llvm.copysign.f32(float %y, float %z)
+  %r = call nnan nsz float @llvm.copysign.f32(float %x, float %s)
+  ret float %r
+}
+
 define float @fneg_mag(float %x, float %y) {
 ; CHECK-LABEL: @fneg_mag(
 ; CHECK-NEXT:    [[R:%.*]] = call float @llvm.copysign.f32(float [[X:%.*]], float [[Y:%.*]])
diff --git a/llvm/test/Transforms/InstCombine/fcmp-fadd-select.ll b/llvm/test/Transforms/InstCombine/fcmp-fadd-select.ll
index 0d0af91..15fad55 100644
--- a/llvm/test/Transforms/InstCombine/fcmp-fadd-select.ll
+++ b/llvm/test/Transforms/InstCombine/fcmp-fadd-select.ll
@@ -19,7 +19,7 @@ define float @test_fcmp_ogt_fadd_select_constant(float %in) {
 define float @test_fcmp_ogt_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_ogt_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00)
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00)
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -87,7 +87,7 @@ define float @test_fcmp_olt_fadd_select_constant(float %in) {
 define float @test_fcmp_olt_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_olt_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00)
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00)
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -155,7 +155,7 @@ define float @test_fcmp_oge_fadd_select_constant(float %in) {
 define float @test_fcmp_oge_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_oge_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00)
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00)
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -223,7 +223,7 @@ define float @test_fcmp_ole_fadd_select_constant(float %in) {
 define float @test_fcmp_ole_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_ole_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00)
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00)
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -293,7 +293,7 @@ define float @test_fcmp_ugt_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_ugt_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
 ; CHECK-NEXT:    [[CMP1_INV:%.*]] = fcmp ole float [[IN]], 0.000000e+00
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float 0.000000e+00, float [[IN]]
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float [[IN]], float 0.000000e+00
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -366,7 +366,7 @@ define float @test_fcmp_uge_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_uge_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
 ; CHECK-NEXT:    [[CMP1_INV:%.*]] = fcmp olt float [[IN]], 0.000000e+00
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float 0.000000e+00, float [[IN]]
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float [[IN]], float 0.000000e+00
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -439,7 +439,7 @@ define float @test_fcmp_ult_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_ult_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
 ; CHECK-NEXT:    [[CMP1_INV:%.*]] = fcmp oge float [[IN]], 0.000000e+00
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float 0.000000e+00, float [[IN]]
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float [[IN]], float 0.000000e+00
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -512,7 +512,7 @@ define float @test_fcmp_ule_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_ule_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
 ; CHECK-NEXT:    [[CMP1_INV:%.*]] = fcmp ogt float [[IN]], 0.000000e+00
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float 0.000000e+00, float [[IN]]
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float [[IN]], float 0.000000e+00
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/icmp-add.ll b/llvm/test/Transforms/InstCombine/icmp-add.ll
index 579247a..a8cdf80 100644
--- a/llvm/test/Transforms/InstCombine/icmp-add.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-add.ll
@@ -79,6 +79,19 @@ bb:
   ret i1 %i4
 }
 
+define i1 @cvt_icmp_0_zext_plus_zext_eq_i2(i1 %a, i1 %b) {
+; CHECK-LABEL: @cvt_icmp_0_zext_plus_zext_eq_i2(
+; CHECK-NEXT:    [[TMP1:%.*]] = or i1 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = xor i1 [[TMP1]], true
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %a.ext = zext i1 %a to i2
+  %b.ext = zext i1 %b to i2
+  %add = add i2 %a.ext, %b.ext
+  %cmp = icmp eq i2 %add, 0
+  ret i1 %cmp
+}
+
 define i1 @cvt_icmp_1_zext_plus_zext_eq(i1 %arg, i1 %arg1) {
 ; CHECK-LABEL: @cvt_icmp_1_zext_plus_zext_eq(
 ; CHECK-NEXT:  bb:
diff --git a/llvm/test/Transforms/InstCombine/icmp-gep.ll b/llvm/test/Transforms/InstCombine/icmp-gep.ll
index f9b90c2..7f8f1ae 100644
--- a/llvm/test/Transforms/InstCombine/icmp-gep.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-gep.ll
@@ -709,3 +709,51 @@ define i1 @pointer_icmp_aligned_with_offset_negative(ptr align 8 %a, ptr align 8
   %cmp = icmp eq ptr %gep, %a2
   ret i1 %cmp
 }
+
+define i1 @gep_diff_base_same_indices(ptr %x, ptr %y, i64 %z) {
+; CHECK-LABEL: @gep_diff_base_same_indices(
+; CHECK-NEXT:    [[X:%.*]] = getelementptr i8, ptr [[X1:%.*]], i64 [[Z:%.*]]
+; CHECK-NEXT:    [[Y:%.*]] = getelementptr i8, ptr [[Y1:%.*]], i64 [[Z]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[X]], [[Y]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr i8, ptr %x, i64 %z
+  %gep2 = getelementptr i8, ptr %y, i64 %z
+  %cmp = icmp ult ptr %gep1, %gep2
+  ret i1 %cmp
+}
+
+define i1 @gep_diff_base_same_indices_nuw(ptr %x, ptr %y, i64 %z) {
+; CHECK-LABEL: @gep_diff_base_same_indices_nuw(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr nuw i8, ptr %x, i64 %z
+  %gep2 = getelementptr nuw i8, ptr %y, i64 %z
+  %cmp = icmp ult ptr %gep1, %gep2
+  ret i1 %cmp
+}
+
+define i1 @gep_diff_base_same_indices_nusw(ptr %x, ptr %y, i64 %z) {
+; CHECK-LABEL: @gep_diff_base_same_indices_nusw(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr nusw i8, ptr %x, i64 %z
+  %gep2 = getelementptr nusw i8, ptr %y, i64 %z
+  %cmp = icmp ult ptr %gep1, %gep2
+  ret i1 %cmp
+}
+
+define i1 @gep_diff_base_same_indices_nuw_nusw(ptr %x, ptr %y, i64 %z) {
+; CHECK-LABEL: @gep_diff_base_same_indices_nuw_nusw(
+; CHECK-NEXT:    [[X:%.*]] = getelementptr nuw i8, ptr [[X1:%.*]], i64 [[Z:%.*]]
+; CHECK-NEXT:    [[Y:%.*]] = getelementptr nusw i8, ptr [[Y1:%.*]], i64 [[Z]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[X]], [[Y]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr nuw i8, ptr %x, i64 %z
+  %gep2 = getelementptr nusw i8, ptr %y, i64 %z
+  %cmp = icmp ult ptr %gep1, %gep2
+  ret i1 %cmp
+}
diff --git a/llvm/test/Transforms/InstCombine/onehot_merge.ll b/llvm/test/Transforms/InstCombine/onehot_merge.ll
index 2e57597..3b7314d 100644
--- a/llvm/test/Transforms/InstCombine/onehot_merge.ll
+++ b/llvm/test/Transforms/InstCombine/onehot_merge.ll
@@ -1143,3 +1143,19 @@ define i1 @foo1_and_signbit_lshr_without_shifting_signbit_not_pwr2_logical(i32 %
   %or = select i1 %t2, i1 true, i1 %t4
   ret i1 %or
 }
+
+define i1 @two_types_of_bittest(i8 %x, i8 %c) {
+; CHECK-LABEL: @two_types_of_bittest(
+; CHECK-NEXT:    [[T0:%.*]] = shl nuw i8 1, [[C:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = or i8 [[T0]], -128
+; CHECK-NEXT:    [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    ret i1 [[RET]]
+;
+  %t0 = shl i8 1, %c
+  %icmp1 = icmp slt i8 %x, 0
+  %and = and i8 %x, %t0
+  %icmp2 = icmp ne i8 %and, 0
+  %ret = and i1 %icmp1, %icmp2
+  ret i1 %ret
+}
diff --git a/llvm/test/Transforms/InstCombine/opaque-ptr.ll b/llvm/test/Transforms/InstCombine/opaque-ptr.ll
index bac51c8..b052746 100644
--- a/llvm/test/Transforms/InstCombine/opaque-ptr.ll
+++ b/llvm/test/Transforms/InstCombine/opaque-ptr.ll
@@ -654,6 +654,64 @@ join:
   ret ptr %gep
 }
 
+define ptr @gep_of_phi_of_gep_flags1(i1 %c, ptr %p) {
+; CHECK-LABEL: @gep_of_phi_of_gep_flags1(
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 4, [[IF]] ], [ 8, [[ELSE]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4
+; CHECK-NEXT:    ret ptr [[GEP]]
+;
+  br i1 %c, label %if, label %else
+
+if:
+  %gep1 = getelementptr inbounds i32, ptr %p, i64 1
+  br label %join
+
+else:
+  %gep2 = getelementptr i32, ptr %p, i64 2
+  br label %join
+
+join:
+  %phi = phi ptr [ %gep1, %if ], [ %gep2, %else ]
+  %gep = getelementptr i32, ptr %phi, i64 1
+  ret ptr %gep
+}
+
+define ptr @gep_of_phi_of_gep_flags2(i1 %c, ptr %p) {
+; CHECK-LABEL: @gep_of_phi_of_gep_flags2(
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 4, [[IF]] ], [ 8, [[ELSE]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr nuw i8, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4
+; CHECK-NEXT:    ret ptr [[GEP]]
+;
+  br i1 %c, label %if, label %else
+
+if:
+  %gep1 = getelementptr nuw i32, ptr %p, i64 1
+  br label %join
+
+else:
+  %gep2 = getelementptr nuw i32, ptr %p, i64 2
+  br label %join
+
+join:
+  %phi = phi ptr [ %gep1, %if ], [ %gep2, %else ]
+  %gep = getelementptr i32, ptr %phi, i64 1
+  ret ptr %gep
+}
+
 define ptr @gep_of_phi_of_gep_different_type(i1 %c, ptr %p) {
 ; CHECK-LABEL: @gep_of_phi_of_gep_different_type(
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
diff --git a/llvm/test/Transforms/InstCombine/overflow.ll b/llvm/test/Transforms/InstCombine/overflow.ll
index a8969a5..22e1631 100644
--- a/llvm/test/Transforms/InstCombine/overflow.ll
+++ b/llvm/test/Transforms/InstCombine/overflow.ll
@@ -11,7 +11,7 @@ define i32 @test1(i32 %a, i32 %b) nounwind ssp {
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractvalue { i32, i1 } [[SADD]], 1
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR2:[0-9]+]]
+; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR3:[0-9]+]]
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[SADD_RESULT:%.*]] = extractvalue { i32, i1 } [[SADD]], 0
@@ -49,7 +49,7 @@ define i32 @test2(i32 %a, i32 %b, ptr %P) nounwind ssp {
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[ADD_OFF]], 4294967295
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR2]]
+; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR3]]
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[CONV9:%.*]] = trunc i64 [[ADD]] to i32
@@ -86,7 +86,7 @@ define i64 @test3(i32 %a, i32 %b) nounwind ssp {
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], -4294967296
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR2]]
+; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR3]]
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    ret i64 [[ADD]]
@@ -116,7 +116,7 @@ define zeroext i8 @test4(i8 signext %a, i8 signext %b) nounwind ssp {
 ; CHECK-NEXT:    [[CMP:%.*]] = extractvalue { i8, i1 } [[SADD]], 1
 ; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR2]]
+; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR3]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[SADD_RESULT:%.*]] = extractvalue { i8, i1 } [[SADD]], 0
@@ -150,7 +150,7 @@ define i32 @test8(i64 %a, i64 %b) nounwind ssp {
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], -4294967296
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR2]]
+; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR3]]
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[CONV9:%.*]] = trunc i64 [[ADD]] to i32
@@ -171,3 +171,91 @@ if.end:
   ret i32 %conv9
 }
 
+define i32 @uadd_no_overflow(i32 %a, i32 %b) {
+; CHECK-LABEL: @uadd_no_overflow(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %val = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
+  %ov = extractvalue { i32, i1 } %val, 1
+  %nowrap = xor i1 %ov, true
+  tail call void @llvm.assume(i1 %nowrap)
+  %res = extractvalue { i32, i1 } %val, 0
+  ret i32 %res
+}
+
+define i32 @smul_no_overflow(i32 %a, i32 %b) {
+; CHECK-LABEL: @smul_no_overflow(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %val = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %a, i32 %b)
+  %ov = extractvalue { i32, i1 } %val, 1
+  %nowrap = xor i1 %ov, true
+  tail call void @llvm.assume(i1 %nowrap)
+  %res = extractvalue { i32, i1 } %val, 0
+  ret i32 %res
+}
+
+define i32 @smul_overflow(i32 %a, i32 %b) {
+; CHECK-LABEL: @smul_overflow(
+; CHECK-NEXT:    [[VAL:%.*]] = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i32, i1 } [[VAL]], 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[OV]])
+; CHECK-NEXT:    [[RES:%.*]] = extractvalue { i32, i1 } [[VAL]], 0
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %val = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %a, i32 %b)
+  %ov = extractvalue { i32, i1 } %val, 1
+  tail call void @llvm.assume(i1 %ov)
+  %res = extractvalue { i32, i1 } %val, 0
+  ret i32 %res
+}
+
+define i32 @uadd_no_overflow_invalid1(i32 %a, i32 %b, i1 %cond) {
+; CHECK-LABEL: @uadd_no_overflow_invalid1(
+; CHECK-NEXT:    [[VAL:%.*]] = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; CHECK-NEXT:    [[RES:%.*]] = extractvalue { i32, i1 } [[VAL]], 0
+; CHECK-NEXT:    call void @use(i32 [[RES]])
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i32, i1 } [[VAL]], 1
+; CHECK-NEXT:    [[NOWRAP:%.*]] = xor i1 [[OV]], true
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[NOWRAP]])
+; CHECK-NEXT:    ret i32 [[RES]]
+; CHECK:       if.else:
+; CHECK-NEXT:    ret i32 0
+;
+  %val = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
+  %res = extractvalue { i32, i1 } %val, 0
+  call void @use(i32 %res)
+  br i1 %cond, label %if.then, label %if.else
+if.then:
+  %ov = extractvalue { i32, i1 } %val, 1
+  %nowrap = xor i1 %ov, true
+  tail call void @llvm.assume(i1 %nowrap)
+  ret i32 %res
+if.else:
+  ret i32 0
+}
+
+define i32 @uadd_no_overflow_invalid2(i32 %a, i32 %b, i1 %cond) {
+; CHECK-LABEL: @uadd_no_overflow_invalid2(
+; CHECK-NEXT:    [[VAL:%.*]] = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i32, i1 } [[VAL]], 1
+; CHECK-NEXT:    [[NOWRAP:%.*]] = xor i1 [[OV]], true
+; CHECK-NEXT:    call void @use(i32 0)
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[NOWRAP]])
+; CHECK-NEXT:    [[RES:%.*]] = extractvalue { i32, i1 } [[VAL]], 0
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %val = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
+  %ov = extractvalue { i32, i1 } %val, 1
+  %nowrap = xor i1 %ov, true
+  call void @use(i32 0) ; It is not guaranteed to transfer execution to its successors
+  tail call void @llvm.assume(i1 %nowrap)
+  %res = extractvalue { i32, i1 } %val, 0
+  ret i32 %res
+}
+
+declare void @use(i32)
diff --git a/llvm/test/Transforms/InstCombine/phi.ll b/llvm/test/Transforms/InstCombine/phi.ll
index e3bf5ef..4756b4f 100644
--- a/llvm/test/Transforms/InstCombine/phi.ll
+++ b/llvm/test/Transforms/InstCombine/phi.ll
@@ -2822,3 +2822,179 @@ for.cond:                                         ; preds = %for.cond, %entry
 exit:                                             ; preds = %for.cond
   ret i64 0
 }
+
+define i1 @test_zext_icmp_eq_0(i1 %a, i1 %b, i32 %c) {
+; CHECK-LABEL: @test_zext_icmp_eq_0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[A:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[B:%.*]], true
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[C:%.*]], 0
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[CMP:%.*]] = phi i1 [ [[TMP0]], [[IF]] ], [ [[TMP1]], [[ELSE]] ]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  br i1 %a, label %if, label %else
+
+if:
+  %b.ext = zext i1 %b to i32
+  br label %join
+
+else:
+  br label %join
+
+join:
+  %phi = phi i32 [ %b.ext, %if ], [ %c, %else ]
+  %cmp = icmp eq i32 %phi, 0
+  ret i1 %cmp
+}
+
+define i1 @test_zext_icmp_ne_0(i1 %a, i1 %b, i32 %c) {
+; CHECK-LABEL: @test_zext_icmp_ne_0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[A:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ne i32 [[C:%.*]], 0
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i1 [ [[B:%.*]], [[IF]] ], [ [[TMP0]], [[ELSE]] ]
+; CHECK-NEXT:    ret i1 [[PHI]]
+;
+entry:
+  br i1 %a, label %if, label %else
+
+if:
+  %b.ext = zext i1 %b to i32
+  br label %join
+
+else:
+  br label %join
+
+join:
+  %phi = phi i32 [ %b.ext, %if ], [ %c, %else ]
+  %cmp = icmp ne i32 %phi, 0
+  ret i1 %cmp
+}
+
+define i1 @test_zext_icmp_eq_1(i1 %a, i1 %b, i32 %c) {
+; CHECK-LABEL: @test_zext_icmp_eq_1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[A:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[C:%.*]], 1
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i1 [ [[B:%.*]], [[IF]] ], [ [[TMP0]], [[ELSE]] ]
+; CHECK-NEXT:    ret i1 [[PHI]]
+;
+entry:
+  br i1 %a, label %if, label %else
+
+if:
+  %b.ext = zext i1 %b to i32
+  br label %join
+
+else:
+  br label %join
+
+join:
+  %phi = phi i32 [ %b.ext, %if ], [ %c, %else ]
+  %cmp = icmp eq i32 %phi, 1
+  ret i1 %cmp
+}
+
+define i1 @test_zext_icmp_eq_0_loop(i1 %c, i1 %b) {
+; CHECK-LABEL: @test_zext_icmp_eq_0_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[X:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[Y:%.*]] = and i1 [[X]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP0]] = xor i1 [[Y]], true
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i1 [[X]]
+;
+entry:
+  br label %loop
+
+loop:
+  %phi = phi i32 [ 1, %entry ], [ %ext, %loop ]
+  %x = icmp eq i32 %phi, 0
+  %y = and i1 %x, %b
+  %ext = zext i1 %y to i32
+  br i1 %c, label %loop, label %exit
+
+exit:
+  ret i1 %x
+}
+
+define i1 @test_zext_icmp_eq_0_multi_use(i1 %a, i1 %b, i32 %c) {
+; CHECK-LABEL: @test_zext_icmp_eq_0_multi_use(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[A:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[B_EXT:%.*]] = zext i1 [[B:%.*]] to i32
+; CHECK-NEXT:    call void @use(i32 [[B_EXT]])
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[B_EXT]], [[IF]] ], [ [[C:%.*]], [[ELSE]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  br i1 %a, label %if, label %else
+
+if:
+  %b.ext = zext i1 %b to i32
+  call void @use(i32 %b.ext)
+  br label %join
+
+else:
+  br label %join
+
+join:
+  %phi = phi i32 [ %b.ext, %if ], [ %c, %else ]
+  %cmp = icmp eq i32 %phi, 0
+  ret i1 %cmp
+}
+
+define i1 @test_zext_icmp_eq_0_not_bool(i1 %a, i2 %b, i32 %c) {
+; CHECK-LABEL: @test_zext_icmp_eq_0_not_bool(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[A:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[B_EXT:%.*]] = zext i2 [[B:%.*]] to i32
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[B_EXT]], [[IF]] ], [ [[C:%.*]], [[ELSE]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  br i1 %a, label %if, label %else
+
+if:
+  %b.ext = zext i2 %b to i32
+  br label %join
+
+else:
+  br label %join
+
+join:
+  %phi = phi i32 [ %b.ext, %if ], [ %c, %else ]
+  %cmp = icmp eq i32 %phi, 0
+  ret i1 %cmp
+}
diff --git a/llvm/test/Transforms/InstCombine/rem-mul-shl.ll b/llvm/test/Transforms/InstCombine/rem-mul-shl.ll
index e7d6cc7..920497c0 100644
--- a/llvm/test/Transforms/InstCombine/rem-mul-shl.ll
+++ b/llvm/test/Transforms/InstCombine/rem-mul-shl.ll
@@ -372,6 +372,32 @@ define <2 x i8> @srem_XY_XZ_with_CY_gt_CZ_no_nuw_out(<2 x i8> %X) {
   ret <2 x i8> %r
 }
 
+define i8 @srem_XY_XZ_with_CY_gt_CZ_drop_nsw(i8 noundef %X) {
+; CHECK-LABEL: @srem_XY_XZ_with_CY_gt_CZ_drop_nsw(
+; CHECK-NEXT:    [[BO0:%.*]] = mul nsw i8 [[X:%.*]], 127
+; CHECK-NEXT:    [[BO1:%.*]] = shl nsw i8 [[X]], 7
+; CHECK-NEXT:    [[R:%.*]] = srem i8 [[BO1]], [[BO0]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %BO0 = mul nsw i8 %X, 127
+  %BO1 = shl nsw i8 %X, 7
+  %r = srem i8 %BO1, %BO0
+  ret i8 %r
+}
+
+define i8 @srem_XY_XZ_with_CY_gt_CZ_drop_nsw_commuted(i8 noundef %X) {
+; CHECK-LABEL: @srem_XY_XZ_with_CY_gt_CZ_drop_nsw_commuted(
+; CHECK-NEXT:    [[BO0:%.*]] = mul nsw i8 [[X:%.*]], 127
+; CHECK-NEXT:    [[BO1:%.*]] = shl nsw i8 [[X]], 7
+; CHECK-NEXT:    [[R:%.*]] = srem i8 [[BO0]], [[BO1]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %BO0 = mul nsw i8 %X, 127
+  %BO1 = shl nsw i8 %X, 7
+  %r = srem i8 %BO0, %BO1
+  ret i8 %r
+}
+
 define i8 @srem_XY_XZ_with_CY_gt_CZ_fail_missing_flag1(i8 %X) {
 ; CHECK-LABEL: @srem_XY_XZ_with_CY_gt_CZ_fail_missing_flag1(
 ; CHECK-NEXT:    [[BO0:%.*]] = mul nuw nsw i8 [[X:%.*]], 10
diff --git a/llvm/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll b/llvm/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll
index 35b4087..2cb70e8 100644
--- a/llvm/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll
+++ b/llvm/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll
@@ -495,6 +495,19 @@ define i32 @test_cttz_not_bw(i32 %x) {
   ret i32 %res
 }
 
+define i32 @test_cttz_not_bw_noundef(i32 %x) {
+; CHECK-LABEL: @test_cttz_not_bw_noundef(
+; CHECK-NEXT:    [[CT:%.*]] = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[RES:%.*]] = select i1 [[CMP_NOT]], i32 123, i32 [[CT]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %ct = tail call noundef i32 @llvm.cttz.i32(i32 %x, i1 false)
+  %cmp = icmp ne i32 %x, 0
+  %res = select i1 %cmp, i32 %ct, i32 123
+  ret i32 %res
+}
+
 define i32 @test_cttz_not_bw_multiuse(i32 %x) {
 ; CHECK-LABEL: @test_cttz_not_bw_multiuse(
 ; CHECK-NEXT:    [[CT:%.*]] = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 false)
diff --git a/llvm/test/Transforms/InstCombine/select-divrem.ll b/llvm/test/Transforms/InstCombine/select-divrem.ll
index a674f9c..7dff78e 100644
--- a/llvm/test/Transforms/InstCombine/select-divrem.ll
+++ b/llvm/test/Transforms/InstCombine/select-divrem.ll
@@ -322,6 +322,21 @@ define i8 @rem_euclid_non_const_pow2(i8 %0, i8 %1) {
   ret i8 %sel
 }
 
+define i8 @rem_euclid_non_const_pow2_commuted(i8 %0, i8 %1) {
+; CHECK-LABEL: @rem_euclid_non_const_pow2_commuted(
+; CHECK-NEXT:    [[NOTMASK:%.*]] = shl nsw i8 -1, [[TMP0:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i8 [[NOTMASK]], -1
+; CHECK-NEXT:    [[SEL:%.*]] = and i8 [[TMP1:%.*]], [[TMP3]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %pow2 = shl i8 1, %0
+  %rem = srem i8 %1, %pow2
+  %cond = icmp slt i8 %rem, 0
+  %add = add i8 %pow2, %rem
+  %sel = select i1 %cond, i8 %add, i8 %rem
+  ret i8 %sel
+}
+
 define i32 @rem_euclid_pow2_true_arm_folded(i32 %n) {
 ; CHECK-LABEL: @rem_euclid_pow2_true_arm_folded(
 ; CHECK-NEXT:    [[RES:%.*]] = and i32 [[N:%.*]], 1
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index 0168a80..0f15fa6 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -3937,11 +3937,8 @@ entry:
 define i32 @src_or_eq_0_and_xor(i32 %x, i32 %y) {
 ; CHECK-LABEL: @src_or_eq_0_and_xor(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[OR]], 0
-; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[Y]], [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 0, i32 [[XOR]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
 ;
 entry:
   %or = or i32 %y, %x
@@ -3956,11 +3953,8 @@ entry:
 define i32 @src_or_eq_0_xor_and(i32 %x, i32 %y) {
 ; CHECK-LABEL: @src_or_eq_0_xor_and(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[OR]], 0
-; CHECK-NEXT:    [[AND:%.*]] = and i32 [[Y]], [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 0, i32 [[AND]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[AND]]
 ;
 entry:
   %or = or i32 %y, %x
@@ -4438,11 +4432,8 @@ define i32 @src_no_trans_select_and_eq0_xor_and(i32 %x, i32 %y) {
 
 define i32 @src_no_trans_select_or_eq0_or_and(i32 %x, i32 %y) {
 ; CHECK-LABEL: @src_no_trans_select_or_eq0_or_and(
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[OR0:%.*]] = icmp eq i32 [[OR]], 0
-; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[Y]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR0]], i32 0, i32 [[AND]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[AND]]
 ;
   %or = or i32 %x, %y
   %or0 = icmp eq i32 %or, 0
@@ -4453,11 +4444,8 @@ define i32 @src_no_trans_select_or_eq0_or_and(i32 %x, i32 %y) {
 
 define i32 @src_no_trans_select_or_eq0_or_xor(i32 %x, i32 %y) {
 ; CHECK-LABEL: @src_no_trans_select_or_eq0_or_xor(
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[OR0:%.*]] = icmp eq i32 [[OR]], 0
-; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X]], [[Y]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR0]], i32 0, i32 [[XOR]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
 ;
   %or = or i32 %x, %y
   %or0 = icmp eq i32 %or, 0
@@ -4492,11 +4480,8 @@ define i32 @src_no_trans_select_or_eq0_xor_or(i32 %x, i32 %y) {
 
 define i32 @src_no_trans_select_and_ne0_xor_or(i32 %x, i32 %y) {
 ; CHECK-LABEL: @src_no_trans_select_and_ne0_xor_or(
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[OR0_NOT:%.*]] = icmp eq i32 [[OR]], 0
-; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X]], [[Y]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR0_NOT]], i32 0, i32 [[XOR]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
 ;
   %or = or i32 %x, %y
   %or0 = icmp ne i32 %or, 0
@@ -4843,3 +4828,16 @@ define i32 @replace_and_cond_multiuse2(i1 %cond1, i1 %cond2) {
   %mux = select i1 %cond1, i32 %sel, i32 1
   ret i32 %mux
 }
+
+define i32 @src_simplify_2x_at_once_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_simplify_2x_at_once_and(
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
+;
+  %and = and i32 %x, %y
+  %and0 = icmp eq i32 %and, -1
+  %sub = sub i32 %x, %y
+  %xor = xor i32 %x, %y
+  %cond = select i1 %and0, i32 %sub, i32 %xor
+  ret i32 %cond
+}
diff --git a/llvm/test/Transforms/InstCombine/xor-and-or.ll b/llvm/test/Transforms/InstCombine/xor-and-or.ll
index 47275ce..c380e27 100644
--- a/llvm/test/Transforms/InstCombine/xor-and-or.ll
+++ b/llvm/test/Transforms/InstCombine/xor-and-or.ll
@@ -25,6 +25,18 @@ define i1 @xor_logic_and_logic_or2(i1 %c, i1 %x, i1 %y) {
   ret i1 %r
 }
 
+define i1 @xor_logic_and_logic_or2_commuted(i1 %c, i1 %x, i1 %y) {
+; CHECK-LABEL: @xor_logic_and_logic_or2_commuted(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i1 [[X:%.*]], true
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[C:%.*]], i1 [[TMP1]], i1 [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %o = select i1 %y, i1 true, i1 %c
+  %a = select i1 %c, i1 %x, i1 false
+  %r = xor i1 %o, %a
+  ret i1 %r
+}
+
 define i1 @xor_logic_and_logic_or3(i1 %c, i1 %x, i1 %y) {
 ; CHECK-LABEL: @xor_logic_and_logic_or3(
 ; CHECK-NEXT:    [[TMP1:%.*]] = freeze i1 [[C:%.*]]
diff --git a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll
new file mode 100644
index 0000000..543c731
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll
@@ -0,0 +1,1129 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=instsimplify -march=nvptx64 -S | FileCheck %s
+
+; f2i/f2ui and d2i/d2ui - double/float to i32 tests
+
+;###############################################################
+;#               Tests with Positive 1.5                       #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_f2i_rm() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rm() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2i.rm(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2i_rn() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rn() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2i.rn(float 1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_1_5_f2i_rp() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rp() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2i.rp(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2i_rz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2i.rz(float 1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2i_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_f2i_rm_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rm_ftz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2i.rm.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2i_rn_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rn_ftz() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2i.rn.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2i_rp_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rp_ftz() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2i.rp.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2i_rz_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rz_ftz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2i.rz.ftz(float 1.5)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_d2i_rm() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2i_rm() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2i.rm(double 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_d2i_rn() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2i_rn() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.d2i.rn(double 1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_1_5_d2i_rp() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2i_rp() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.d2i.rp(double 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_d2i_rz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2i_rz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2i.rz(double 1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_f2ui_rm() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rm() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2ui_rn() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rn() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn(float 1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_1_5_f2ui_rp() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rp() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2ui_rz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz(float 1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ui_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_f2ui_rm_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rm_ftz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2ui_rn_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rn_ftz() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2ui_rp_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rp_ftz() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2ui_rz_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rz_ftz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float 1.5)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_d2ui_rm() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2ui_rm() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2ui.rm(double 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_d2ui_rn() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2ui_rn() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.d2ui.rn(double 1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_1_5_d2ui_rp() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2ui_rp() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.d2ui.rp(double 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_d2ui_rz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2ui_rz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2ui.rz(double 1.5)
+  ret i32 %res
+}
+
+;###############################################################
+;#               Tests with Negative 1.5                       #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_f2i_rm() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rm() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.f2i.rm(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2i_rn() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rn() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.f2i.rn(float -1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_1_5_f2i_rp() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rp() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.f2i.rp(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2i_rz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rz() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.f2i.rz(float -1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2i_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_f2i_rm_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rm_ftz() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.f2i.rm.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2i_rn_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rn_ftz() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.f2i.rn.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2i_rp_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rp_ftz() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.f2i.rp.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2i_rz_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rz_ftz() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.f2i.rz.ftz(float -1.5)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_d2i_rm() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2i_rm() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.d2i.rm(double -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_d2i_rn() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2i_rn() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.d2i.rn(double -1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_1_5_d2i_rp() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2i_rp() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.d2i.rp(double -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_d2i_rz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2i_rz() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.d2i.rz(double -1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_f2ui_rm() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rm(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2ui_rn() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rn() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rn(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn(float -1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_1_5_f2ui_rp() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rp() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rp(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2ui_rz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rz(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz(float -1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ui_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_f2ui_rm_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rm_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rm.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2ui_rn_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rn_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rn.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2ui_rp_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rp_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rp.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2ui_rz_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rz_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rz.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float -1.5)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_d2ui_rm() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rm(double -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.d2ui.rm(double -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_d2ui_rn() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rn() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rn(double -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.d2ui.rn(double -1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_1_5_d2ui_rp() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rp() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rp(double -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.d2ui.rp(double -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_d2ui_rz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rz(double -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.d2ui.rz(double -1.5)
+  ret i32 %res
+}
+
+;###############################################################
+;#                    Tests with NaN                           #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_nan_f2i_rm() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rm(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2i_rn() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+
+define i32 @test_nan_f2i_rp() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rp(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2i_rz() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2i_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_nan_f2i_rm_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rm.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2i_rn_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2i_rp_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rp.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2i_rz_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_nan_d2i_rm() {
+; CHECK-LABEL: define i32 @test_nan_d2i_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rm(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_d2i_rn() {
+; CHECK-LABEL: define i32 @test_nan_d2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rn(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+
+define i32 @test_nan_d2i_rp() {
+; CHECK-LABEL: define i32 @test_nan_d2i_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rp(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_d2i_rz() {
+; CHECK-LABEL: define i32 @test_nan_d2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rz(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_nan_f2ui_rm() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2ui_rn() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+
+define i32 @test_nan_f2ui_rp() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2ui_rz() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ui_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_nan_f2ui_rm_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2ui_rn_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2ui_rp_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2ui_rz_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_nan_d2ui_rm() {
+; CHECK-LABEL: define i32 @test_nan_d2ui_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rm(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_d2ui_rn() {
+; CHECK-LABEL: define i32 @test_nan_d2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rn(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+
+define i32 @test_nan_d2ui_rp() {
+; CHECK-LABEL: define i32 @test_nan_d2ui_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rp(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_d2ui_rz() {
+; CHECK-LABEL: define i32 @test_nan_d2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rz(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+;###############################################################
+;#            Tests with Positive Subnormal                    #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_f2i_rm() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rm(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2i_rn() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_subnormal_f2i_rp() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rp() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2i.rp(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2i_rz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2i_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_f2i_rm_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rm.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2i_rn_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2i_rp_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rp.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2i_rz_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_d2i_rm() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2i_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rm(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_d2i_rn() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rn(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_subnormal_d2i_rp() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2i_rp() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2i.rp(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_d2i_rz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rz(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_f2ui_rm() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2ui_rn() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_subnormal_f2ui_rp() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rp() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2ui_rz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ui_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_f2ui_rm_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2ui_rn_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2ui_rp_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2ui_rz_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_d2ui_rm() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2ui_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rm(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_d2ui_rn() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rn(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_subnormal_d2ui_rp() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2ui_rp() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2ui.rp(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_d2ui_rz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rz(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+;###############################################################
+;#            Tests with Negative Subnormal                    #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_f2i_rm() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rm() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.f2i.rm(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2i_rn() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_subnormal_f2i_rp() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rp(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2i_rz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2i_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_f2i_rm_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rm.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2i_rn_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2i_rp_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rp.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2i_rz_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_d2i_rm() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2i_rm() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.d2i.rm(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_d2i_rn() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rn(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_subnormal_d2i_rp() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2i_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rp(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_d2i_rz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rz(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_f2ui_rm() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rm(float 0xB80FFFFFC0000000)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2ui_rn() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_subnormal_f2ui_rp() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2ui_rz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ui_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_f2ui_rm_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2ui_rn_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2ui_rp_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2ui_rz_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_d2ui_rm() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rm(double 0x800FFFFFFFFFFFFF)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.d2ui.rm(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_d2ui_rn() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rn(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_subnormal_d2ui_rp() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rp(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_d2ui_rz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rz(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+declare i32 @llvm.nvvm.f2i.rm(float)
+declare i32 @llvm.nvvm.f2i.rn(float)
+declare i32 @llvm.nvvm.f2i.rp(float)
+declare i32 @llvm.nvvm.f2i.rz(float)
+
+declare i32 @llvm.nvvm.f2i.rm.ftz(float)
+declare i32 @llvm.nvvm.f2i.rn.ftz(float)
+declare i32 @llvm.nvvm.f2i.rp.ftz(float)
+declare i32 @llvm.nvvm.f2i.rz.ftz(float)
+
+declare i32 @llvm.nvvm.d2i.rm(double)
+declare i32 @llvm.nvvm.d2i.rn(double)
+declare i32 @llvm.nvvm.d2i.rp(double)
+declare i32 @llvm.nvvm.d2i.rz(double)
+
+
+declare i32 @llvm.nvvm.f2ui.rm(float)
+declare i32 @llvm.nvvm.f2ui.rn(float)
+declare i32 @llvm.nvvm.f2ui.rp(float)
+declare i32 @llvm.nvvm.f2ui.rz(float)
+
+declare i32 @llvm.nvvm.f2ui.rm.ftz(float)
+declare i32 @llvm.nvvm.f2ui.rn.ftz(float)
+declare i32 @llvm.nvvm.f2ui.rp.ftz(float)
+declare i32 @llvm.nvvm.f2ui.rz.ftz(float)
+
+declare i32 @llvm.nvvm.d2ui.rm(double)
+declare i32 @llvm.nvvm.d2ui.rn(double)
+declare i32 @llvm.nvvm.d2ui.rp(double)
+declare i32 @llvm.nvvm.d2ui.rz(double)
diff --git a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll
new file mode 100644
index 0000000..be38177
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll
@@ -0,0 +1,1129 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=instsimplify -march=nvptx64 -S | FileCheck %s
+
+; f2ll/f2ull and d2ll/d2ull - double/float to i64 tests
+
+;###############################################################
+;#               Tests with Positive 1.5                       #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_f2ll_rm() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rm() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ll_rn() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rn() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn(float 1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_1_5_f2ll_rp() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rp() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ll_rz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz(float 1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ll_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_f2ll_rm_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rm_ftz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ll_rn_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rn_ftz() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ll_rp_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rp_ftz() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ll_rz_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rz_ftz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 1.5)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_d2ll_rm() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ll_rm() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rm(double 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_d2ll_rn() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ll_rn() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.d2ll.rn(double 1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_1_5_d2ll_rp() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ll_rp() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.d2ll.rp(double 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_d2ll_rz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ll_rz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rz(double 1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_f2ull_rm() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rm() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ull_rn() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rn() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn(float 1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_1_5_f2ull_rp() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rp() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ull_rz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz(float 1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ull_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_f2ull_rm_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rm_ftz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ull_rn_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rn_ftz() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ull_rp_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rp_ftz() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ull_rz_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rz_ftz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float 1.5)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_d2ull_rm() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ull_rm() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ull.rm(double 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_d2ull_rn() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ull_rn() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.d2ull.rn(double 1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_1_5_d2ull_rp() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ull_rp() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.d2ull.rp(double 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_d2ull_rz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ull_rz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ull.rz(double 1.5)
+  ret i64 %res
+}
+
+;###############################################################
+;#               Tests with Negative 1.5                       #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_f2ll_rm() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rm() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ll_rn() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rn() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn(float -1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_1_5_f2ll_rp() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rp() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ll_rz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rz() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz(float -1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ll_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_f2ll_rm_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rm_ftz() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ll_rn_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rn_ftz() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ll_rp_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rp_ftz() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ll_rz_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rz_ftz() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float -1.5)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_d2ll_rm() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ll_rm() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.d2ll.rm(double -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_d2ll_rn() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ll_rn() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.d2ll.rn(double -1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_1_5_d2ll_rp() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ll_rp() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rp(double -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_d2ll_rz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ll_rz() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rz(double -1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_f2ull_rm() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rm(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ull_rn() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rn() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rn(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn(float -1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_1_5_f2ull_rp() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rp() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rp(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ull_rz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rz(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz(float -1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ull_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_f2ull_rm_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rm_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rm.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ull_rn_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rn_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rn.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ull_rp_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rp_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rp.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ull_rz_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rz_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rz.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float -1.5)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_d2ull_rm() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rm(double -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.d2ull.rm(double -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_d2ull_rn() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rn() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rn(double -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.d2ull.rn(double -1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_1_5_d2ull_rp() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rp() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rp(double -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.d2ull.rp(double -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_d2ull_rz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rz(double -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.d2ull.rz(double -1.5)
+  ret i64 %res
+}
+
+;###############################################################
+;#                    Tests with NaN                           #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_nan_f2ll_rm() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ll_rn() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+
+define i64 @test_nan_f2ll_rp() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ll_rz() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ll_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_nan_f2ll_rm_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ll_rn_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ll_rp_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ll_rz_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_nan_d2ll_rm() {
+; CHECK-LABEL: define i64 @test_nan_d2ll_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rm(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_d2ll_rn() {
+; CHECK-LABEL: define i64 @test_nan_d2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rn(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+
+define i64 @test_nan_d2ll_rp() {
+; CHECK-LABEL: define i64 @test_nan_d2ll_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rp(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_d2ll_rz() {
+; CHECK-LABEL: define i64 @test_nan_d2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rz(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_nan_f2ull_rm() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ull_rn() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+
+define i64 @test_nan_f2ull_rp() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ull_rz() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ull_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_nan_f2ull_rm_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ull_rn_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ull_rp_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ull_rz_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_nan_d2ull_rm() {
+; CHECK-LABEL: define i64 @test_nan_d2ull_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rm(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_d2ull_rn() {
+; CHECK-LABEL: define i64 @test_nan_d2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rn(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+
+define i64 @test_nan_d2ull_rp() {
+; CHECK-LABEL: define i64 @test_nan_d2ull_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rp(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_d2ull_rz() {
+; CHECK-LABEL: define i64 @test_nan_d2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rz(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+;###############################################################
+;#            Tests with Positive Subnormal                    #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_f2ll_rm() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ll_rn() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_subnormal_f2ll_rp() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rp() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ll_rz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ll_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_f2ll_rm_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ll_rn_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ll_rp_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ll_rz_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_d2ll_rm() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ll_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rm(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_d2ll_rn() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rn(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_subnormal_d2ll_rp() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ll_rp() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rp(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_d2ll_rz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rz(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_f2ull_rm() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ull_rn() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_subnormal_f2ull_rp() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rp() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ull_rz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ull_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_f2ull_rm_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ull_rn_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ull_rp_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ull_rz_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_d2ull_rm() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ull_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rm(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_d2ull_rn() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rn(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_subnormal_d2ull_rp() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ull_rp() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ull.rp(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_d2ull_rz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rz(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+;###############################################################
+;#            Tests with Negative Subnormal                    #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_f2ll_rm() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rm() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ll_rn() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_subnormal_f2ll_rp() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ll_rz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ll_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_f2ll_rm_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ll_rn_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ll_rp_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ll_rz_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_d2ll_rm() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ll_rm() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rm(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_d2ll_rn() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rn(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_subnormal_d2ll_rp() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ll_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rp(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_d2ll_rz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rz(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_f2ull_rm() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rm(float 0xB80FFFFFC0000000)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ull_rn() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_subnormal_f2ull_rp() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ull_rz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ull_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_f2ull_rm_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ull_rn_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ull_rp_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ull_rz_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_d2ull_rm() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rm(double 0x800FFFFFFFFFFFFF)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.d2ull.rm(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_d2ull_rn() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rn(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_subnormal_d2ull_rp() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rp(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_d2ull_rz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rz(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+declare i64 @llvm.nvvm.f2ll.rm(float)
+declare i64 @llvm.nvvm.f2ll.rn(float)
+declare i64 @llvm.nvvm.f2ll.rp(float)
+declare i64 @llvm.nvvm.f2ll.rz(float)
+
+declare i64 @llvm.nvvm.f2ll.rm.ftz(float)
+declare i64 @llvm.nvvm.f2ll.rn.ftz(float)
+declare i64 @llvm.nvvm.f2ll.rp.ftz(float)
+declare i64 @llvm.nvvm.f2ll.rz.ftz(float)
+
+declare i64 @llvm.nvvm.d2ll.rm(double)
+declare i64 @llvm.nvvm.d2ll.rn(double)
+declare i64 @llvm.nvvm.d2ll.rp(double)
+declare i64 @llvm.nvvm.d2ll.rz(double)
+
+
+declare i64 @llvm.nvvm.f2ull.rm(float)
+declare i64 @llvm.nvvm.f2ull.rn(float)
+declare i64 @llvm.nvvm.f2ull.rp(float)
+declare i64 @llvm.nvvm.f2ull.rz(float)
+
+declare i64 @llvm.nvvm.f2ull.rm.ftz(float)
+declare i64 @llvm.nvvm.f2ull.rn.ftz(float)
+declare i64 @llvm.nvvm.f2ull.rp.ftz(float)
+declare i64 @llvm.nvvm.f2ull.rz.ftz(float)
+
+declare i64 @llvm.nvvm.d2ull.rm(double)
+declare i64 @llvm.nvvm.d2ull.rn(double)
+declare i64 @llvm.nvvm.d2ull.rp(double)
+declare i64 @llvm.nvvm.d2ull.rz(double)
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
new file mode 100644
index 0000000..fe8a7e5
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
@@ -0,0 +1,180 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt %s -mtriple=x86_64-unknown-linux-gnu -passes=load-store-vectorizer -mcpu=skx -S -o - | FileCheck %s
+
+; This test verifies that the vectorizer can handle an extended sequence of
+; getelementptr instructions and generate longer vectors. With special handling,
+; some elements can still be vectorized even if they require looking up the
+; common underlying object deeper than 6 levels from the original pointer.
+
+; The test below is the simplified version of actual performance oriented
+; workload; the offsets in getelementptr instructions are similar or same for
+; the test simplicity.
+
+define void @v1_v2_v4_v1_to_v8_levels_6_7_8_8(i32 %arg0, ptr align 16 %arg1) {
+; CHECK-LABEL: define void @v1_v2_v4_v1_to_v8_levels_6_7_8_8(
+; CHECK-SAME: i32 [[ARG0:%.*]], ptr align 16 [[ARG1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[LEVEL1:%.*]] = getelementptr i8, ptr [[ARG1]], i32 917504
+; CHECK-NEXT:    [[LEVEL2:%.*]] = getelementptr i8, ptr [[LEVEL1]], i32 [[ARG0]]
+; CHECK-NEXT:    [[LEVEL3:%.*]] = getelementptr i8, ptr [[LEVEL2]], i32 32768
+; CHECK-NEXT:    [[LEVEL4:%.*]] = getelementptr i8, ptr [[LEVEL3]], i32 [[ARG0]]
+; CHECK-NEXT:    [[LEVEL5:%.*]] = getelementptr i8, ptr [[LEVEL4]], i32 [[ARG0]]
+; CHECK-NEXT:    [[A6:%.*]] = getelementptr i8, ptr [[LEVEL5]], i32 [[ARG0]]
+; CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[A6]], align 16
+; CHECK-NEXT:    ret void
+;
+
+  %level1 = getelementptr i8, ptr %arg1, i32 917504
+  %level2 = getelementptr i8, ptr %level1, i32 %arg0
+  %level3 = getelementptr i8, ptr %level2, i32 32768
+  %level4 = getelementptr i8, ptr %level3, i32 %arg0
+  %level5 = getelementptr i8, ptr %level4, i32 %arg0
+
+  %a6 = getelementptr i8, ptr %level5, i32 %arg0
+  %b7 = getelementptr i8, ptr %a6, i32 2
+  %c8 = getelementptr i8, ptr %b7, i32 8
+  %d8 = getelementptr i8, ptr %b7, i32 12
+
+  store half 0xH0000, ptr %a6, align 16
+  store <4 x half> zeroinitializer, ptr %b7, align 2
+  store <2 x half> zeroinitializer, ptr %c8, align 2
+  store half 0xH0000, ptr %d8, align 2
+  ret void
+}
+
+define void @v1x8_levels_6_7_8_9_10_11_12_13(i32 %arg0, ptr align 16 %arg1) {
+; CHECK-LABEL: define void @v1x8_levels_6_7_8_9_10_11_12_13(
+; CHECK-SAME: i32 [[ARG0:%.*]], ptr align 16 [[ARG1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LEVEL1:%.*]] = getelementptr i8, ptr [[ARG1]], i32 917504
+; CHECK-NEXT:    [[LEVEL2:%.*]] = getelementptr i8, ptr [[LEVEL1]], i32 [[ARG0]]
+; CHECK-NEXT:    [[LEVEL3:%.*]] = getelementptr i8, ptr [[LEVEL2]], i32 32768
+; CHECK-NEXT:    [[LEVEL4:%.*]] = getelementptr i8, ptr [[LEVEL3]], i32 [[ARG0]]
+; CHECK-NEXT:    [[LEVEL5:%.*]] = getelementptr i8, ptr [[LEVEL4]], i32 [[ARG0]]
+; CHECK-NEXT:    [[A6:%.*]] = getelementptr i8, ptr [[LEVEL5]], i32 [[ARG0]]
+; CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[A6]], align 16
+; CHECK-NEXT:    ret void
+;
+
+  %level1 = getelementptr i8, ptr %arg1, i32 917504
+  %level2 = getelementptr i8, ptr %level1, i32 %arg0
+  %level3 = getelementptr i8, ptr %level2, i32 32768
+  %level4 = getelementptr i8, ptr %level3, i32 %arg0
+  %level5 = getelementptr i8, ptr %level4, i32 %arg0
+
+  %a6 = getelementptr i8, ptr %level5, i32 %arg0
+  %b7 = getelementptr i8, ptr %a6, i32 2
+  %c8 = getelementptr i8, ptr %b7, i32 2
+  %d9 = getelementptr i8, ptr %c8, i32 2
+  %e10 = getelementptr i8, ptr %d9, i32 2
+  %f11 = getelementptr i8, ptr %e10, i32 2
+  %g12 = getelementptr i8, ptr %f11, i32 2
+  %h13 = getelementptr i8, ptr %g12, i32 2
+
+  store half 0xH0000, ptr %a6, align 16
+  store half 0xH0000, ptr %b7, align 2
+  store half 0xH0000, ptr %c8, align 2
+  store half 0xH0000, ptr %d9, align 2
+  store half 0xH0000, ptr %e10, align 8
+  store half 0xH0000, ptr %f11, align 2
+  store half 0xH0000, ptr %g12, align 2
+  store half 0xH0000, ptr %h13, align 2
+  ret void
+}
+
+define void @v1_4_4_4_2_1_to_v8_8_levels_6_7(i32 %arg0, ptr addrspace(3) align 16 %arg1_ptr, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, half %arg6_half, half %arg7_half, <2 x half> %arg8_2xhalf) {
+; CHECK-LABEL: define void @v1_4_4_4_2_1_to_v8_8_levels_6_7(
+; CHECK-SAME: i32 [[ARG0:%.*]], ptr addrspace(3) align 16 [[ARG1_PTR:%.*]], i32 [[ARG2:%.*]], i32 [[ARG3:%.*]], i32 [[ARG4:%.*]], i32 [[ARG5:%.*]], half [[ARG6_HALF:%.*]], half [[ARG7_HALF:%.*]], <2 x half> [[ARG8_2XHALF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[ARG1_PTR]], i32 458752
+; CHECK-NEXT:    br [[DOTPREHEADER11_PREHEADER:label %.*]]
+; CHECK:       [[_PREHEADER11_PREHEADER:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i32 [[ARG0]], 6
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[ARG2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[ARG3]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[ARG0]], 2
+; CHECK-NEXT:    br i1 [[CMP]], [[DOTLR_PH:label %.*]], [[DOTEXIT_POINT:label %.*]]
+; CHECK:       [[_LR_PH:.*:]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP5]], i32 [[ARG4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[GEP]], i32 [[ARG5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> poison, half [[ARG6_HALF]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half 0xH0000, i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x half> [[TMP8]], half 0xH0000, i32 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x half> [[TMP9]], half 0xH0000, i32 3
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x half> [[TMP10]], half 0xH0000, i32 4
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x half> [[ARG8_2XHALF]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x half> [[TMP11]], half [[TMP12]], i32 5
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x half> [[ARG8_2XHALF]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x half> [[TMP13]], half [[TMP14]], i32 6
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x half> [[TMP15]], half [[ARG7_HALF]], i32 7
+; CHECK-NEXT:    store <8 x half> [[TMP16]], ptr addrspace(3) [[TMP6]], align 2
+; CHECK-NEXT:    br [[DOTEXIT_POINT]]
+; CHECK:       [[_EXIT_POINT:.*:]]
+; CHECK-NEXT:    ret void
+;
+  %base1 = getelementptr inbounds i8, ptr addrspace(3) %arg1_ptr, i32 458752
+  br label %.preheader11.preheader
+
+.preheader11.preheader:
+  %base2 = shl nuw nsw i32 %arg0, 6
+  %base3 = getelementptr inbounds i8, ptr addrspace(3) %base1, i32 %base2
+
+  %base4 = getelementptr inbounds i8, ptr addrspace(3) %base3, i32 %arg2
+  %base5 = getelementptr inbounds i8, ptr addrspace(3) %base4, i32 %arg3
+
+  %cmp = icmp sgt i32 %arg0, 2
+  br i1 %cmp, label %.lr.ph, label %.exit_point
+
+.lr.ph:
+  %gep = getelementptr inbounds i8, ptr addrspace(3) %base5, i32 %arg4
+
+  %dst = getelementptr inbounds i8, ptr addrspace(3) %gep, i32 %arg5
+  %dst_off2 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 2
+  %dst_off10 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 10
+  %dst_off14 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 14
+
+  store half %arg6_half, ptr addrspace(3) %dst, align 2
+  store <4 x half> zeroinitializer, ptr addrspace(3) %dst_off2, align 2
+  store <2 x half> %arg8_2xhalf, ptr addrspace(3) %dst_off10, align 2
+  store half %arg7_half, ptr addrspace(3) %dst_off14, align 2
+  br label %.exit_point
+
+.exit_point:
+  ret void
+}
+
+; The regression test for merging equivalence classes. It is reduced and adapted
+; for LSV from llvm/test/CodeGen/NVPTX/variadics-backend.ll, which failed at
+; post-commit checks with memory sanitizer on the initial attempt to implement
+; the merging of the equivalence classes.
+define void @variadics1(ptr %vlist) {
+; CHECK-LABEL: define void @variadics1(
+; CHECK-SAME: ptr [[VLIST:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[ARGP_CUR7_ALIGNED2:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[VLIST]], i64 0)
+; CHECK-NEXT:    [[ARGP_NEXT8:%.*]] = getelementptr i8, ptr [[ARGP_CUR7_ALIGNED2]], i64 8
+; CHECK-NEXT:    [[X0:%.*]] = getelementptr i8, ptr [[ARGP_NEXT8]], i32 7
+; CHECK-NEXT:    [[ARGP_CUR11_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[X0]], i64 0)
+; CHECK-NEXT:    [[ARGP_NEXT12:%.*]] = getelementptr i8, ptr [[ARGP_CUR11_ALIGNED]], i64 8
+; CHECK-NEXT:    [[X2:%.*]] = getelementptr i8, ptr [[ARGP_NEXT12]], i32 7
+; CHECK-NEXT:    [[ARGP_CUR16_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[X2]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARGP_CUR16_ALIGNED]], align 4294967296
+; CHECK-NEXT:    [[X31:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; CHECK-NEXT:    [[X42:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    [[X5:%.*]] = fadd double [[X42]], [[X31]]
+; CHECK-NEXT:    store double [[X5]], ptr null, align 8
+; CHECK-NEXT:    ret void
+;
+  %argp.cur7.aligned2 = call ptr @llvm.ptrmask.p0.i64(ptr %vlist, i64 0)
+  %argp.next8 = getelementptr i8, ptr %argp.cur7.aligned2, i64 8
+  %x0 = getelementptr i8, ptr %argp.next8, i32 7
+  %argp.cur11.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %x0, i64 0)
+  %argp.next12 = getelementptr i8, ptr %argp.cur11.aligned, i64 8
+  %x2 = getelementptr i8, ptr %argp.next12, i32 7
+  %argp.cur16.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %x2, i64 0)
+  %x3 = load double, ptr %argp.cur16.aligned, align 8
+  %argp.cur16.aligned_off8 = getelementptr i8, ptr %argp.cur16.aligned, i32 8
+  %x4 = load double, ptr %argp.cur16.aligned_off8, align 8
+  %x5 = fadd double %x4, %x3
+  store double %x5, ptr null, align 8
+  ret void
+}
+
+declare ptr @llvm.ptrmask.p0.i64(ptr, i64)
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll
index 64e8a6b..7a3817d 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-unknown-unknown -print-lsr-output < %s 2>&1 | FileCheck %s
+; RUN: llc -mtriple=aarch64 -stop-after=loop-reduce < %s | FileCheck %s
 
 declare void @foo(i64)
 
diff --git a/llvm/test/Transforms/LoopStrengthReduce/NVPTX/trunc.ll b/llvm/test/Transforms/LoopStrengthReduce/NVPTX/trunc.ll
index 8761122..e6b5991 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/NVPTX/trunc.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/NVPTX/trunc.ll
@@ -13,7 +13,7 @@ target triple = "nvptx64-nvidia-cuda"
 ; That would be worthless, because "i" is simulated by two 32-bit registers and
 ; truncating it to 32-bit is as simple as directly using the register that
 ; contains the low bits.
-define void @trunc_is_free(i64 %begin, i64 %stride, i64 %end) {
+define ptx_kernel void @trunc_is_free(i64 %begin, i64 %stride, i64 %end) {
 ; CHECK-LABEL: @trunc_is_free(
 entry:
   %cmp.4 = icmp eq i64 %begin, %end
@@ -41,5 +41,3 @@ for.body:                                         ; preds = %for.body.preheader,
 
 declare void @_Z3usei(i32)
 
-!nvvm.annotations = !{!0}
-!0 = !{ptr @trunc_is_free, !"kernel", i32 1}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/arith-fp-frem-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/arith-fp-frem-costs.ll
index 63149ad..d9b8907 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/arith-fp-frem-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/arith-fp-frem-costs.ll
@@ -1,67 +1,57 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "estimated cost.*frem" --version 4
-
+; REQUIRES: asserts
 ; RUN: opt -mattr=+neon -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=NEON-NO-VECLIB
-
 ; RUN: opt -mattr=+sve -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SVE-NO-VECLIB
-
 ; RUN: opt -mattr=+neon -vector-library=ArmPL -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=NEON-ARMPL
-
 ; RUN: opt -mattr=+neon -vector-library=sleefgnuabi -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=NEON-SLEEF
-
 ; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SVE-ARMPL
-
 ; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SVE-SLEEF
-
 ; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SVE-ARMPL-TAILFOLD
-
 ; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SVE-SLEEF-TAILFOLD
 
-; REQUIRES: asserts
-
 target triple = "aarch64-unknown-linux-gnu"
 
 define void @frem_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 ; NEON-NO-VECLIB-LABEL: 'frem_f64'
 ; NEON-NO-VECLIB:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; NEON-NO-VECLIB:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem double %in, %in
+; NEON-NO-VECLIB:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-NO-VECLIB-LABEL: 'frem_f64'
 ; SVE-NO-VECLIB:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem double %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem double %in, %in
+; SVE-NO-VECLIB:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-NO-VECLIB:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-NO-VECLIB:  Cost of Invalid for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; NEON-ARMPL-LABEL: 'frem_f64'
 ; NEON-ARMPL:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; NEON-ARMPL:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
+; NEON-ARMPL:  Cost of 10 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; NEON-SLEEF-LABEL: 'frem_f64'
 ; NEON-SLEEF:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; NEON-SLEEF:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
+; NEON-SLEEF:  Cost of 10 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-ARMPL-LABEL: 'frem_f64'
 ; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+; SVE-ARMPL:  Cost of 10 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL:  Cost of 10 for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-SLEEF-LABEL: 'frem_f64'
 ; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+; SVE-SLEEF:  Cost of 10 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF:  Cost of 10 for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-ARMPL-TAILFOLD-LABEL: 'frem_f64'
 ; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+; SVE-ARMPL-TAILFOLD:  Cost of 10 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL-TAILFOLD:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL-TAILFOLD:  Cost of 10 for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-SLEEF-TAILFOLD-LABEL: 'frem_f64'
 ; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+; SVE-SLEEF-TAILFOLD:  Cost of 10 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF-TAILFOLD:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF-TAILFOLD:  Cost of 10 for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
   entry:
   br label %for.body
@@ -84,58 +74,58 @@ define void @frem_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 define void @frem_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 ; NEON-NO-VECLIB-LABEL: 'frem_f32'
 ; NEON-NO-VECLIB:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; NEON-NO-VECLIB:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; NEON-NO-VECLIB:  LV: Found an estimated cost of 52 for VF 4 For instruction: %res = frem float %in, %in
+; NEON-NO-VECLIB:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; NEON-NO-VECLIB:  Cost of 52 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-NO-VECLIB-LABEL: 'frem_f32'
 ; SVE-NO-VECLIB:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of 52 for VF 4 For instruction: %res = frem float %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 4 For instruction: %res = frem float %in, %in
+; SVE-NO-VECLIB:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-NO-VECLIB:  Cost of 52 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-NO-VECLIB:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-NO-VECLIB:  Cost of Invalid for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-NO-VECLIB:  Cost of Invalid for VF vscale x 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; NEON-ARMPL-LABEL: 'frem_f32'
 ; NEON-ARMPL:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; NEON-ARMPL:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; NEON-ARMPL:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
+; NEON-ARMPL:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; NEON-ARMPL:  Cost of 10 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; NEON-SLEEF-LABEL: 'frem_f32'
 ; NEON-SLEEF:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; NEON-SLEEF:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; NEON-SLEEF:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
+; NEON-SLEEF:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; NEON-SLEEF:  Cost of 10 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-ARMPL-LABEL: 'frem_f32'
 ; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+; SVE-ARMPL:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL:  Cost of 10 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL:  Cost of Invalid for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL:  Cost of 10 for VF vscale x 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-SLEEF-LABEL: 'frem_f32'
 ; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+; SVE-SLEEF:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF:  Cost of 10 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF:  Cost of Invalid for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF:  Cost of 10 for VF vscale x 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-ARMPL-TAILFOLD-LABEL: 'frem_f32'
 ; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+; SVE-ARMPL-TAILFOLD:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL-TAILFOLD:  Cost of 10 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL-TAILFOLD:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL-TAILFOLD:  Cost of Invalid for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL-TAILFOLD:  Cost of 10 for VF vscale x 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-SLEEF-TAILFOLD-LABEL: 'frem_f32'
 ; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+; SVE-SLEEF-TAILFOLD:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF-TAILFOLD:  Cost of 10 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF-TAILFOLD:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF-TAILFOLD:  Cost of Invalid for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF-TAILFOLD:  Cost of 10 for VF vscale x 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
   entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
index ddf6c10..254cdf2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
@@ -209,6 +209,7 @@ define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i1> poison, i1 [[C_0]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i1> [[BROADCAST_SPLATINSERT]], <16 x i1> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE30:.*]] ]
@@ -218,7 +219,6 @@ define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor <16 x i1> [[TMP3]], splat (i1 true)
-; CHECK-NEXT:    [[TMP5:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <16 x i1> [[TMP4]], <16 x i1> [[TMP5]], <16 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = or <16 x i1> [[TMP6]], [[TMP3]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> zeroinitializer, <16 x i8> splat (i8 1)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
index 4f05087..e63155b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
@@ -80,15 +80,13 @@ define void @powi_call(ptr %P) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[WIDE_LOAD]], i32 3)
-; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TMP2]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TMP4]], align 8
+; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -102,7 +100,7 @@ define void @powi_call(ptr %P) {
 ; CHECK-NEXT:    store double [[POWI]], ptr [[GEP]], align 8
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], 1
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -233,6 +231,5 @@ declare i64 @llvm.fshl.i64(i64, i64, i64)
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
index ac7f147..5b77ced 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -13,9 +13,9 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 8, [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
@@ -99,9 +99,9 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 8673559..caa98d7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -821,11 +821,11 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
 ; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 257, [[TMP2]]
 ; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; PRED-NEXT:    [[TMP3:%.*]] = mul i64 [[N_VEC]], 8
 ; PRED-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
 ; PRED-NEXT:    [[IND_END1:%.*]] = mul i64 [[N_VEC]], 2
-; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; PRED-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
 ; PRED-NEXT:    [[TMP8:%.*]] = sub i64 257, [[TMP7]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
index 03de9ac..88b14b1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
@@ -233,8 +233,8 @@ define void @test_widen_induction_variable_start(ptr %A, i64 %N, i64 %start) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
 ; CHECK-NEXT:    [[IND_END4:%.*]] = add i64 [[START]], [[N_VEC3]]
@@ -409,8 +409,8 @@ define void @test_widen_extended_induction(ptr %dst) {
 ; CHECK:       vec.epilog.iter.check:
 ; CHECK-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], <i8 0, i8 1>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
index f7a1eb4..a939969 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
@@ -48,8 +48,8 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -154,10 +154,10 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE45]], [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT7:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT6]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE44]], [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT10:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT9]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
index cbf9bf0..8c5d84e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
@@ -91,8 +91,8 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[START]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
 ; CHECK:       [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT:    [[IND_END6:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[IND_END:%.*]] = sub i64 [[START]], [[N_VEC]]
+; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[START]], [[N_VEC]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
@@ -117,11 +117,11 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) {
 ; CHECK-NEXT:    br i1 [[CMP_N11]], label %[[EXIT_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       [[VEC_EPILOG_SCALAR_PH]]:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END1]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], %[[ITER_CHECK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL11:%.*]] = phi ptr [ [[IND_END5]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PTR_START]], %[[ITER_CHECK]] ], [ [[IND_END6]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL9:%.*]] = phi ptr [ [[IND_END5]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PTR_START]], %[[ITER_CHECK]] ], [ [[IND_END2]], %[[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL11]], %[[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL9]], %[[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], -1
 ; CHECK-NEXT:    [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 1
 ; CHECK-NEXT:    store i8 0, ptr [[PTR_IV]], align 1
@@ -151,9 +151,9 @@ exit:
   ret void
 }
 
-define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4, ptr %src, ptr %dst.3, i1 %c.3, ptr %dst.2) {
+define void @test_exit_branch_cost(ptr %dst, ptr noalias %x.ptr, ptr noalias %y.ptr, ptr %dst.1, i1 %c.4, ptr %src, ptr %dst.3, i1 %c.3, ptr %dst.2) {
 ; CHECK-LABEL: define void @test_exit_branch_cost(
-; CHECK-SAME: ptr [[DST:%.*]], i64 [[X:%.*]], i32 [[Y:%.*]], ptr [[DST_1:%.*]], i1 [[C_4:%.*]], ptr [[SRC:%.*]], ptr [[DST_3:%.*]], i1 [[C_3:%.*]], ptr [[DST_2:%.*]]) {
+; CHECK-SAME: ptr [[DST:%.*]], ptr noalias [[X_PTR:%.*]], ptr noalias [[Y_PTR:%.*]], ptr [[DST_1:%.*]], i1 [[C_4:%.*]], ptr [[SRC:%.*]], ptr [[DST_3:%.*]], i1 [[C_3:%.*]], ptr [[DST_2:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK:       [[VECTOR_MEMCHECK]]:
@@ -172,11 +172,11 @@ define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4
 ; CHECK-NEXT:    [[BOUND08:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP3]]
 ; CHECK-NEXT:    [[BOUND19:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]]
-; CHECK-NEXT:    [[CONFLICT_RDX11:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]]
+; CHECK-NEXT:    [[CONFLICT_RDX21:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]]
 ; CHECK-NEXT:    [[BOUND012:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP4]]
 ; CHECK-NEXT:    [[BOUND113:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
-; CHECK-NEXT:    [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX11]], [[FOUND_CONFLICT14]]
+; CHECK-NEXT:    [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX21]], [[FOUND_CONFLICT14]]
 ; CHECK-NEXT:    [[BOUND016:%.*]] = icmp ult ptr [[DST_3]], [[SCEVGEP2]]
 ; CHECK-NEXT:    [[BOUND117:%.*]] = icmp ult ptr [[DST_2]], [[SCEVGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
@@ -184,161 +184,101 @@ define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4
 ; CHECK-NEXT:    [[BOUND020:%.*]] = icmp ult ptr [[DST_3]], [[SCEVGEP3]]
 ; CHECK-NEXT:    [[BOUND121:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT22:%.*]] = and i1 [[BOUND020]], [[BOUND121]]
-; CHECK-NEXT:    [[CONFLICT_RDX23:%.*]] = or i1 [[CONFLICT_RDX19]], [[FOUND_CONFLICT22]]
+; CHECK-NEXT:    [[CONFLICT_RDX41:%.*]] = or i1 [[CONFLICT_RDX19]], [[FOUND_CONFLICT22]]
 ; CHECK-NEXT:    [[BOUND024:%.*]] = icmp ult ptr [[DST_3]], [[SCEVGEP4]]
 ; CHECK-NEXT:    [[BOUND125:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT26:%.*]] = and i1 [[BOUND024]], [[BOUND125]]
-; CHECK-NEXT:    [[CONFLICT_RDX27:%.*]] = or i1 [[CONFLICT_RDX23]], [[FOUND_CONFLICT26]]
+; CHECK-NEXT:    [[CONFLICT_RDX27:%.*]] = or i1 [[CONFLICT_RDX41]], [[FOUND_CONFLICT26]]
 ; CHECK-NEXT:    [[BOUND028:%.*]] = icmp ult ptr [[DST_2]], [[SCEVGEP3]]
 ; CHECK-NEXT:    [[BOUND129:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT30:%.*]] = and i1 [[BOUND028]], [[BOUND129]]
-; CHECK-NEXT:    [[CONFLICT_RDX31:%.*]] = or i1 [[CONFLICT_RDX27]], [[FOUND_CONFLICT30]]
+; CHECK-NEXT:    [[CONFLICT_RDX65:%.*]] = or i1 [[CONFLICT_RDX27]], [[FOUND_CONFLICT30]]
 ; CHECK-NEXT:    [[BOUND032:%.*]] = icmp ult ptr [[DST_2]], [[SCEVGEP4]]
 ; CHECK-NEXT:    [[BOUND133:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT34:%.*]] = and i1 [[BOUND032]], [[BOUND133]]
-; CHECK-NEXT:    [[CONFLICT_RDX35:%.*]] = or i1 [[CONFLICT_RDX31]], [[FOUND_CONFLICT34]]
+; CHECK-NEXT:    [[FOUND_CONFLICT68:%.*]] = and i1 [[BOUND032]], [[BOUND133]]
+; CHECK-NEXT:    [[CONFLICT_RDX35:%.*]] = or i1 [[CONFLICT_RDX65]], [[FOUND_CONFLICT68]]
 ; CHECK-NEXT:    [[BOUND036:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP4]]
 ; CHECK-NEXT:    [[BOUND137:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP3]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT38:%.*]] = and i1 [[BOUND036]], [[BOUND137]]
 ; CHECK-NEXT:    [[CONFLICT_RDX39:%.*]] = or i1 [[CONFLICT_RDX35]], [[FOUND_CONFLICT38]]
 ; CHECK-NEXT:    br i1 [[CONFLICT_RDX39]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[X]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT40:%.*]] = insertelement <2 x i1> poison, i1 [[C_3]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT41:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT40]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[C_4]], <2 x i1> [[BROADCAST_SPLAT41]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT56:%.*]] = insertelement <2 x i1> poison, i1 [[C_4]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT57:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT56]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP33:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT57]], splat (i1 true)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE74:.*]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i1> [[TMP47]], splat (i1 true)
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE55:.*]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[X_PTR]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq <2 x i64> [[WIDE_LOAD]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i1> [[TMP47]], splat (i1 true)
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
-; CHECK-NEXT:    br i1 [[TMP6]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; CHECK:       [[PRED_STORE_IF]]:
 ; CHECK-NEXT:    store i64 0, ptr [[DST_1]], align 8, !alias.scope [[META7:![0-9]+]], !noalias [[META10:![0-9]+]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE]]
 ; CHECK:       [[PRED_STORE_CONTINUE]]:
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
-; CHECK-NEXT:    br i1 [[TMP7]], label %[[PRED_STORE_IF42:.*]], label %[[PRED_STORE_CONTINUE43:.*]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF42:.*]], label %[[PRED_STORE_CONTINUE43:.*]]
 ; CHECK:       [[PRED_STORE_IF42]]:
 ; CHECK-NEXT:    store i64 0, ptr [[DST_1]], align 8, !alias.scope [[META7]], !noalias [[META10]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE43]]
 ; CHECK:       [[PRED_STORE_CONTINUE43]]:
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    br i1 [[TMP8]], label %[[PRED_STORE_IF44:.*]], label %[[PRED_STORE_CONTINUE45:.*]]
+; CHECK-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[TMP5]], <2 x i1> [[TMP11]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP13]], i32 0
+; CHECK-NEXT:    br i1 [[TMP16]], label %[[PRED_STORE_IF44:.*]], label %[[PRED_STORE_CONTINUE45:.*]]
 ; CHECK:       [[PRED_STORE_IF44]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_1]], align 8, !alias.scope [[META7]], !noalias [[META10]]
+; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15:![0-9]+]], !noalias [[META16:![0-9]+]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE45]]
 ; CHECK:       [[PRED_STORE_CONTINUE45]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
-; CHECK-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF46:.*]], label %[[PRED_STORE_CONTINUE47:.*]]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF46:.*]], label %[[PRED_STORE_CONTINUE47:.*]]
 ; CHECK:       [[PRED_STORE_IF46]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_1]], align 8, !alias.scope [[META7]], !noalias [[META10]]
+; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15]], !noalias [[META16]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE47]]
 ; CHECK:       [[PRED_STORE_CONTINUE47]]:
-; CHECK-NEXT:    [[TMP10:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true)
-; CHECK-NEXT:    [[TMP11:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true)
-; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP10]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[TMP5]], <2 x i1> [[TMP11]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0
-; CHECK-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF48:.*]], label %[[PRED_STORE_CONTINUE49:.*]]
-; CHECK:       [[PRED_STORE_IF48]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15:![0-9]+]], !noalias [[META16:![0-9]+]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE49]]
-; CHECK:       [[PRED_STORE_CONTINUE49]]:
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1
-; CHECK-NEXT:    br i1 [[TMP15]], label %[[PRED_STORE_IF50:.*]], label %[[PRED_STORE_CONTINUE51:.*]]
-; CHECK:       [[PRED_STORE_IF50]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15]], !noalias [[META16]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE51]]
-; CHECK:       [[PRED_STORE_CONTINUE51]]:
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP13]], i32 0
-; CHECK-NEXT:    br i1 [[TMP16]], label %[[PRED_STORE_IF52:.*]], label %[[PRED_STORE_CONTINUE53:.*]]
-; CHECK:       [[PRED_STORE_IF52]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15]], !noalias [[META16]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE53]]
-; CHECK:       [[PRED_STORE_CONTINUE53]]:
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1
-; CHECK-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF54:.*]], label %[[PRED_STORE_CONTINUE55:.*]]
-; CHECK:       [[PRED_STORE_IF54]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15]], !noalias [[META16]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE55]]
-; CHECK:       [[PRED_STORE_CONTINUE55]]:
-; CHECK-NEXT:    [[TMP18:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[BROADCAST_SPLAT41]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP19:%.*]] = select <2 x i1> [[TMP5]], <2 x i1> [[BROADCAST_SPLAT41]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP20:%.*]] = select <2 x i1> [[TMP18]], <2 x i1> [[BROADCAST_SPLAT57]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP21:%.*]] = select <2 x i1> [[TMP19]], <2 x i1> [[BROADCAST_SPLAT57]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP22:%.*]] = or <2 x i1> [[TMP47]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = or <2 x i1> [[TMP47]], [[TMP21]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP20]], <2 x i64> zeroinitializer, <2 x i64> splat (i64 1)
 ; CHECK-NEXT:    [[PREDPHI58:%.*]] = select <2 x i1> [[TMP21]], <2 x i64> zeroinitializer, <2 x i64> splat (i64 1)
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x i1> [[TMP22]], i32 0
-; CHECK-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF59:.*]], label %[[PRED_STORE_CONTINUE60:.*]]
-; CHECK:       [[PRED_STORE_IF59]]:
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <2 x i64> [[PREDPHI]], i32 0
-; CHECK-NEXT:    store i64 [[TMP25]], ptr [[DST_2]], align 8, !alias.scope [[META17:![0-9]+]], !noalias [[META18:![0-9]+]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE60]]
-; CHECK:       [[PRED_STORE_CONTINUE60]]:
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <2 x i1> [[TMP22]], i32 1
-; CHECK-NEXT:    br i1 [[TMP26]], label %[[PRED_STORE_IF61:.*]], label %[[PRED_STORE_CONTINUE62:.*]]
-; CHECK:       [[PRED_STORE_IF61]]:
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <2 x i64> [[PREDPHI]], i32 1
-; CHECK-NEXT:    store i64 [[TMP27]], ptr [[DST_2]], align 8, !alias.scope [[META17]], !noalias [[META18]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE62]]
-; CHECK:       [[PRED_STORE_CONTINUE62]]:
 ; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <2 x i1> [[TMP23]], i32 0
-; CHECK-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF63:.*]], label %[[PRED_STORE_CONTINUE64:.*]]
-; CHECK:       [[PRED_STORE_IF63]]:
+; CHECK-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF48:.*]], label %[[PRED_STORE_CONTINUE49:.*]]
+; CHECK:       [[PRED_STORE_IF48]]:
 ; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <2 x i64> [[PREDPHI58]], i32 0
-; CHECK-NEXT:    store i64 [[TMP29]], ptr [[DST_2]], align 8, !alias.scope [[META17]], !noalias [[META18]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE64]]
-; CHECK:       [[PRED_STORE_CONTINUE64]]:
+; CHECK-NEXT:    store i64 [[TMP29]], ptr [[DST_2]], align 8, !alias.scope [[META17:![0-9]+]], !noalias [[META18:![0-9]+]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE49]]
+; CHECK:       [[PRED_STORE_CONTINUE49]]:
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <2 x i1> [[TMP23]], i32 1
-; CHECK-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF65:.*]], label %[[PRED_STORE_CONTINUE66:.*]]
-; CHECK:       [[PRED_STORE_IF65]]:
+; CHECK-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF50:.*]], label %[[PRED_STORE_CONTINUE51:.*]]
+; CHECK:       [[PRED_STORE_IF50]]:
 ; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <2 x i64> [[PREDPHI58]], i32 1
 ; CHECK-NEXT:    store i64 [[TMP31]], ptr [[DST_2]], align 8, !alias.scope [[META17]], !noalias [[META18]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE66]]
-; CHECK:       [[PRED_STORE_CONTINUE66]]:
-; CHECK-NEXT:    [[TMP32:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT57]], splat (i1 true)
-; CHECK-NEXT:    [[TMP33:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT57]], splat (i1 true)
-; CHECK-NEXT:    [[TMP34:%.*]] = select <2 x i1> [[TMP18]], <2 x i1> [[TMP32]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE51]]
+; CHECK:       [[PRED_STORE_CONTINUE51]]:
 ; CHECK-NEXT:    [[TMP35:%.*]] = select <2 x i1> [[TMP19]], <2 x i1> [[TMP33]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP36:%.*]] = or <2 x i1> [[TMP22]], [[TMP34]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = or <2 x i1> [[TMP23]], [[TMP35]]
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <2 x i1> [[TMP36]], i32 0
-; CHECK-NEXT:    br i1 [[TMP38]], label %[[PRED_STORE_IF67:.*]], label %[[PRED_STORE_CONTINUE68:.*]]
-; CHECK:       [[PRED_STORE_IF67]]:
-; CHECK-NEXT:    [[TMP45:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19:![0-9]+]]
-; CHECK-NEXT:    store i64 [[TMP45]], ptr [[DST]], align 8, !alias.scope [[META20:![0-9]+]], !noalias [[META19]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE68]]
-; CHECK:       [[PRED_STORE_CONTINUE68]]:
-; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <2 x i1> [[TMP36]], i32 1
-; CHECK-NEXT:    br i1 [[TMP40]], label %[[PRED_STORE_IF69:.*]], label %[[PRED_STORE_CONTINUE70:.*]]
-; CHECK:       [[PRED_STORE_IF69]]:
-; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19]]
-; CHECK-NEXT:    store i64 [[TMP39]], ptr [[DST]], align 8, !alias.scope [[META20]], !noalias [[META19]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE70]]
-; CHECK:       [[PRED_STORE_CONTINUE70]]:
 ; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i1> [[TMP37]], i32 0
-; CHECK-NEXT:    br i1 [[TMP42]], label %[[PRED_STORE_IF71:.*]], label %[[PRED_STORE_CONTINUE72:.*]]
-; CHECK:       [[PRED_STORE_IF71]]:
-; CHECK-NEXT:    [[TMP41:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19]]
-; CHECK-NEXT:    store i64 [[TMP41]], ptr [[DST]], align 8, !alias.scope [[META20]], !noalias [[META19]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE72]]
-; CHECK:       [[PRED_STORE_CONTINUE72]]:
+; CHECK-NEXT:    br i1 [[TMP42]], label %[[PRED_STORE_IF52:.*]], label %[[PRED_STORE_CONTINUE53:.*]]
+; CHECK:       [[PRED_STORE_IF52]]:
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19:![0-9]+]]
+; CHECK-NEXT:    store i64 [[TMP24]], ptr [[DST]], align 8, !alias.scope [[META20:![0-9]+]], !noalias [[META19]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE53]]
+; CHECK:       [[PRED_STORE_CONTINUE53]]:
 ; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <2 x i1> [[TMP37]], i32 1
-; CHECK-NEXT:    br i1 [[TMP44]], label %[[PRED_STORE_IF73:.*]], label %[[PRED_STORE_CONTINUE74]]
-; CHECK:       [[PRED_STORE_IF73]]:
-; CHECK-NEXT:    [[TMP43:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19]]
-; CHECK-NEXT:    store i64 [[TMP43]], ptr [[DST]], align 8, !alias.scope [[META20]], !noalias [[META19]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE74]]
-; CHECK:       [[PRED_STORE_CONTINUE74]]:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    br i1 [[TMP44]], label %[[PRED_STORE_IF54:.*]], label %[[PRED_STORE_CONTINUE55]]
+; CHECK:       [[PRED_STORE_IF54]]:
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19]]
+; CHECK-NEXT:    store i64 [[TMP25]], ptr [[DST]], align 8, !alias.scope [[META20]], !noalias [[META19]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE55]]
+; CHECK:       [[PRED_STORE_CONTINUE55]]:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
 ; CHECK-NEXT:    br i1 [[TMP46]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
@@ -348,6 +288,10 @@ define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4
 ; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; CHECK:       [[LOOP_HEADER]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[X_GEP:%.*]] = getelementptr i64, ptr [[X_PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[X:%.*]] = load i64, ptr [[X_GEP]], align 8
+; CHECK-NEXT:    [[Y_GEP:%.*]] = getelementptr i32, ptr [[Y_PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[Y:%.*]] = load i32, ptr [[Y_GEP]], align 4
 ; CHECK-NEXT:    [[C1:%.*]] = icmp eq i64 [[X]], 0
 ; CHECK-NEXT:    br i1 [[C1]], label %[[THEN_4:.*]], label %[[THEN_1:.*]]
 ; CHECK:       [[THEN_1]]:
@@ -386,6 +330,10 @@ entry:
 
 loop.header:
   %iv = phi i64 [ %iv.next, %loop.latch ], [ 0, %entry ]
+  %x.gep = getelementptr i64, ptr %x.ptr, i64 %iv
+  %x = load i64, ptr %x.gep
+  %y.gep = getelementptr i32, ptr %y.ptr, i64 %iv
+  %y = load i32, ptr %y.gep
   %c1 = icmp eq i64 %x, 0
   br i1 %c1, label %then.4, label %then.1
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index d42e6af..56a468e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -819,8 +819,8 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) {
 ; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX1]], 1
 ; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; PRED-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
 ; PRED-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX1]], 1
+; PRED-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
 ; PRED-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; PRED-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
index bf27f9e..f9cc195 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
@@ -333,10 +333,10 @@ define i64 @test_ptr_ivs_and_widened_ivs(ptr %src, i32 %N) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[P:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[SHL:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
index dec124b..a550f1c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -170,8 +170,8 @@ entry:
 ; VF_2-LABEL: Checking a loop in 'i64_factor_8'
 ; VF_2:         Found an estimated cost of 8 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_2-NEXT:    Found an estimated cost of 8 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_2-NEXT:    Found an estimated cost of 12 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_2-NEXT:    Found an estimated cost of 12 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_2-NEXT:    Found an estimated cost of 8 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_2-NEXT:    Found an estimated cost of 8 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.8, ptr %data, i64 %i, i32 2
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
index 393ee8d..9dceb01 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
@@ -50,12 +50,12 @@ define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[DOTCAST1:%.*]] = trunc nuw i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END8:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END13:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP6]]
+; CHECK-NEXT:    [[IND_END10:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END10:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP7]]
-; CHECK-NEXT:    [[DOTCAST7:%.*]] = trunc nuw i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END8:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST7]]
+; CHECK-NEXT:    [[IND_END13:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 12
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
@@ -171,10 +171,10 @@ define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]]
-; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[DOTCAST6:%.*]] = trunc nuw i64 [[N_VEC]] to i32
 ; CHECK-NEXT:    [[IND_END7:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST6]]
+; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 24
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
index 528e202..6d57f21 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
@@ -105,9 +105,9 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS1-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 8
 ; CHECK-VS1-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], [[TMP29]]
 ; CHECK-VS1-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]]
-; CHECK-VS1-NEXT:    [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-VS1-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-VS1-NEXT:    [[TMP31:%.*]] = mul i64 [[TMP30]], 8
+; CHECK-VS1-NEXT:    [[TMP39:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-VS1-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 8 x i8> poison, i8 [[CONV]], i64 0
 ; CHECK-VS1-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 8 x i8> [[BROADCAST_SPLATINSERT7]], <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-VS1-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
@@ -127,7 +127,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS1-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]]
 ; CHECK-VS1-NEXT:    br i1 [[CMP_N10]], label %[[WHILE_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK-VS1:       [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-VS1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-VS1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP39]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-VS1-NEXT:    br label %[[WHILE_BODY:.*]]
 ; CHECK-VS1:       [[WHILE_BODY]]:
 ; CHECK-VS1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[WHILE_BODY]] ]
@@ -213,9 +213,9 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS2-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 4
 ; CHECK-VS2-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], [[TMP29]]
 ; CHECK-VS2-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]]
-; CHECK-VS2-NEXT:    [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-VS2-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-VS2-NEXT:    [[TMP31:%.*]] = mul i64 [[TMP30]], 4
+; CHECK-VS2-NEXT:    [[TMP39:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-VS2-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 4 x i8> poison, i8 [[CONV]], i64 0
 ; CHECK-VS2-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 4 x i8> [[BROADCAST_SPLATINSERT7]], <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-VS2-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
@@ -235,7 +235,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS2-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]]
 ; CHECK-VS2-NEXT:    br i1 [[CMP_N10]], label %[[WHILE_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK-VS2:       [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-VS2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-VS2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP39]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-VS2-NEXT:    br label %[[WHILE_BODY:.*]]
 ; CHECK-VS2:       [[WHILE_BODY]]:
 ; CHECK-VS2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[WHILE_BODY]] ]
@@ -428,9 +428,9 @@ define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef %
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 16
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP1]])
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[CONV]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll b/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll
index 771dd000..0ff98d2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll
@@ -19,8 +19,8 @@ define i64 @mul_select_operand_known_1_via_scev() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> [[VEC_PHI]])
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 12, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index 0cea16d..3d4f7e0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -137,9 +137,9 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
 ; VSCALEFORTUNING2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; VSCALEFORTUNING2-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VSCALEFORTUNING2:       scalar.ph:
-; VSCALEFORTUNING2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VSCALEFORTUNING2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; VSCALEFORTUNING2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; VSCALEFORTUNING2-NEXT:    [[SCALAR_RECUR_INIT11:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; VSCALEFORTUNING2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; VSCALEFORTUNING2-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP50]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; VSCALEFORTUNING2-NEXT:    br label [[LOOP:%.*]]
 ; VSCALEFORTUNING2:       loop:
@@ -260,9 +260,9 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
 ; PRED-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[TMP29]], i32 [[TMP47]]
 ; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; PRED:       scalar.ph:
-; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; PRED-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP28]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1]] ]
+; PRED-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP28]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
 ; PRED-NEXT:    [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1]] ]
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1]] ]
 ; PRED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP44]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1]] ]
 ; PRED-NEXT:    br label [[LOOP1:%.*]]
 ; PRED:       loop:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
index a426cdf..a83c62b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
@@ -23,9 +23,9 @@ define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], [[TMP5]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = add i32 [[IDX]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = add i32 [[IDX]], [[N_VEC]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[IDX]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[DOTSPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index 6ecaff0..cb4fd04 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -512,9 +512,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-UNORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = insertelement <vscale x 4 x float> splat (float -0.000000e+00), float [[A2]], i32 0
 ; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = insertelement <vscale x 4 x float> splat (float -0.000000e+00), float [[A1]], i32 0
 ; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -540,9 +540,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[A2]], [[ENTRY]] ]
 ; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX2:%.*]] = phi float [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[A1]], [[ENTRY]] ]
+; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-UNORDERED:       for.body:
 ; CHECK-UNORDERED-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
@@ -583,9 +583,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-ORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED:       vector.body:
 ; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -607,9 +607,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[A2]], [[ENTRY]] ]
 ; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX2:%.*]] = phi float [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ [[A1]], [[ENTRY]] ]
+; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED:       for.body:
 ; CHECK-ORDERED-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
@@ -649,9 +649,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-TF-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP2]], [[TMP5]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-ORDERED-TF-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-ORDERED-TF-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]]
@@ -684,9 +684,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-TF:       middle.block:
 ; CHECK-ORDERED-TF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED-TF:       scalar.ph:
-; CHECK-ORDERED-TF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[A2]], [[ENTRY]] ]
 ; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX2:%.*]] = phi float [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[A1]], [[ENTRY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-ORDERED-TF-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED-TF:       for.body:
 ; CHECK-ORDERED-TF-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
index 7d058a6..295c065 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
@@ -380,9 +380,9 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 {
 ; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP23]], 8
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 10000, [[TMP24]]
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 10000, [[N_MOD_VF2]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC3]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC3]]
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
 ; CHECK-NEXT:    [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
index 5f09431..90ef2da 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
@@ -16,10 +16,10 @@ define void @induction_i7(ptr %dst) #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i7
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP40]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i7
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP40]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT_:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = trunc <vscale x 2 x i64> [[DOTSPLAT_]] to <vscale x 2 x i7>
@@ -72,7 +72,7 @@ for.end:                                          ; preds = %for.body
 
 ; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction   %indvars.iv.next1295 = add i3 %indvars.iv1294, 1
 ; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction   %indvars.iv1294 = phi i3 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
-; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN-CAST ir<%zexti3> = zext  ir<%indvars.iv1294> to i64
+; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN-CAST ir<%zexti3> = zext ir<%indvars.iv1294> to i64
 
 define void @induction_i3_zext(ptr %dst) #0 {
 ; CHECK-LABEL: define void @induction_i3_zext(
@@ -82,10 +82,10 @@ define void @induction_i3_zext(ptr %dst) #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i3
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP40]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i3
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP40]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT_:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = trunc <vscale x 2 x i64> [[DOTSPLAT_]] to <vscale x 2 x i3>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 0924992..bf95622 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -467,9 +467,9 @@ define void @even_load_static_tc(ptr noalias nocapture readonly %A, ptr noalias
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i64 512, [[TMP1]]
-; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -553,9 +553,9 @@ define void @even_load_dynamic_tc(ptr noalias nocapture readonly %A, ptr noalias
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP6]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP9]]
-; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1248,9 +1248,9 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP5]], -4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], [[DOTNEG]]
-; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl <vscale x 4 x i64> [[TMP8]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP6]], 3
@@ -1339,10 +1339,10 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP5]], -4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], [[DOTNEG]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END:%.*]] = or disjoint i64 [[TMP6]], 3
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw i64 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_END:%.*]] = or disjoint i64 [[TMP11]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 4 x i64> [[TMP9]], splat (i64 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[TMP10]], splat (i64 3)
@@ -1449,9 +1449,9 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP8]], -4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP1]], [[DOTNEG]]
-; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw i32 [[TMP11]], 2
 ; CHECK-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], -1
@@ -1492,9 +1492,9 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[TMP33:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
index 64b69be..322f96f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
@@ -19,12 +19,12 @@ define ptr @test(ptr %start.1, ptr %start.2, ptr %end) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP7]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[N_VEC]], 8
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START_1:%.*]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[N_VEC]], 8
 ; CHECK-NEXT:    [[IND_END3:%.*]] = getelementptr i8, ptr [[START_2]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
index d81cfbf..1f7d0b7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
@@ -11,7 +11,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; VPLANS-LABEL: Checking a loop in 'simple_memset'
 ; VPLANS:      VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' {
 ; VPLANS-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; VPLANS-NEXT: vp<[[TC:%[0-9]+]]> = original trip-count
+; VPLANS:      vp<[[TC:%[0-9]+]]> = original trip-count
 ; VPLANS-EMPTY:
 ; VPLANS-NEXT: ir-bb<entry>:
 ; VPLANS-NEXT:  EMIT vp<[[TC]]> = EXPAND SCEV (1 umax %n)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
index e5b9812..75b2df9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -215,9 +215,9 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP2]], [[TMP5]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
index de15004..603bd98 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
@@ -17,6 +17,8 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:  vp<[[END1:%.+]]> = DERIVED-IV ir<%start.1> + vp<[[VEC_TC]]> * ir<8>
+; CHECK-NEXT:  vp<[[END2:%.+]]> = DERIVED-IV ir<%start.2> + vp<[[VEC_TC]]> * ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   <x1> vector loop: {
@@ -55,11 +57,12 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 8
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START_1:%.*]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, ptr [[START_2:%.*]], i64 [[N_VEC]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
@@ -150,9 +153,9 @@ define void @pointer_induction(ptr noalias %start, i64 %N) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[N_VEC]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
index 4b096e1..881de8d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -25,10 +25,10 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -8
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[N_VEC]], 3
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3
+; CHECK-NEXT:    [[TMP26:%.*]] = shl i64 [[N_VEC]], 3
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP26]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -138,12 +138,12 @@ define void @widen_2ptrs_phi_unrolled(ptr noalias nocapture %dst, ptr noalias no
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -8
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 3
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -229,12 +229,12 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], [[DOTNEG]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
index 3ef99ff..c119248 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
@@ -45,10 +45,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi [[VTC]], ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %iv.next, %N
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -75,7 +76,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT:     CLONE [[GEP_IDX:.*]] = getelementptr inbounds ir<%indices>, [[STEPS]]
 ; CHECK-NEXT:     [[VECP_IDX:vp.*]] = vector-pointer [[GEP_IDX]]
 ; CHECK-NEXT:     WIDEN [[IDX:.*]] = load [[VECP_IDX]]
-; CHECK-NEXT:     WIDEN-CAST [[EXT_IDX:.*]] = zext  [[IDX]] to i64
+; CHECK-NEXT:     WIDEN-CAST [[EXT_IDX:.*]] = zext [[IDX]] to i64
 ; CHECK-NEXT:     WIDEN-GEP Inv[Var] [[GEP_BUCKET:.*]] = getelementptr inbounds ir<%buckets>, [[EXT_IDX]]
 ; CHECK-NEXT:     WIDEN-HISTOGRAM buckets: [[GEP_BUCKET]], inc: ir<1>
 ; CHECK-NEXT:     EMIT [[IV_NEXT]] = add nuw [[IV]], [[VFxUF]]
@@ -90,10 +91,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi [[VTC]], ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %iv.next, %N
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
index 8ac46fe..0c246c6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
@@ -43,10 +43,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -88,10 +89,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -138,6 +140,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
@@ -183,10 +186,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -232,6 +236,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
@@ -277,10 +282,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
index bd2e5dc..0bc3ea9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
@@ -8,14 +8,14 @@ target triple = "aarch64-unknown-linux-gnu"
 
 define void @zext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 {
 ; CHECK-COST-LABEL: LV: Checking a loop in 'zext_i8_i16'
-; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
-; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
-; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %conv = zext i8 %0 to i32
 ; CHECK-LABEL: define void @zext_i8_i16
 ; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -85,14 +85,14 @@ exit:                                 ; preds = %for.body
 
 define void @sext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 {
 ; CHECK-COST-LABEL: LV: Checking a loop in 'sext_i8_i16'
-; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
-; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
-; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
 ; CHECK-LABEL: define void @sext_i8_i16
 ; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
index 648f6e8..a119707 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
@@ -41,10 +41,11 @@ target triple = "arm64-apple-ios"
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %cmp = icmp ne i64 %iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -86,10 +87,11 @@ target triple = "arm64-apple-ios"
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %cmp = icmp ne i64 %iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
index 7b18e5c..b229103 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
@@ -27,7 +27,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3>
 ; CHECK: Cost of 0 for VF 2: vp<%4> = vector-pointer ir<%arrayidx>
 ; CHECK: Cost of 18 for VF 2: WIDEN ir<%1> = load vp<%4>
-; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv> = sext  ir<%1> to i32
+; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv> = sext ir<%1> to i32
 ; CHECK: Cost of 20 for VF 2: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1>
 ; CHECK: Cost of 26 for VF 2: WIDEN ir<%conv6> = add ir<%1>, ir<%0>
 ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<%3>
@@ -44,7 +44,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 ; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3>
 ; CHECK: Cost of 0 for VF 4: vp<%4> = vector-pointer ir<%arrayidx>
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load vp<%4>
-; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv> = sext  ir<%1> to i32
+; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv> = sext ir<%1> to i32
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1>
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%conv6> = add ir<%1>, ir<%0>
 ; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<%3>
@@ -61,7 +61,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 ; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3>
 ; CHECK: Cost of 0 for VF 8: vp<%4> = vector-pointer ir<%arrayidx>
 ; CHECK: Cost of 2 for VF 8: WIDEN ir<%1> = load vp<%4>
-; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv> = sext  ir<%1> to i32
+; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv> = sext ir<%1> to i32
 ; CHECK: Cost of 36 for VF 8: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1>
 ; CHECK: Cost of 2 for VF 8: WIDEN ir<%conv6> = add ir<%1>, ir<%0>
 ; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<%3>
@@ -144,15 +144,15 @@ for.inc:                                          ; preds = %for.body, %if.then
 ; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<[[STEPS3]]>
 ; CHECK: Cost of 0 for VF 2: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep>
 ; CHECK: Cost of 18 for VF 2: WIDEN ir<%0> = load vp<[[VEC_PTR]]>
-; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv1> = sext  ir<%0> to i32
+; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32
 ; CHECK: Cost of 0 for VF 2: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.2
 ; CHECK: Cost of 18 for VF 2: WIDEN ir<%1> = load vp<[[VEC_PTR2]]>
-; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv3> = sext  ir<%1> to i32
+; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32
 ; CHECK: Cost of 26 for VF 2: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
 ; CHECK: Cost of 18 for VF 2: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
 ; CHECK: Cost of 0 for VF 2: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
 ; CHECK: Cost of 22 for VF 2: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv4> = trunc  ir<%spec.select.i> to i8
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8
 ; CHECK: Cost of 0 for VF 2: vp<[[VEC_PTR3:%.+]]> = vector-pointer vp<%next.gep>.1
 ; CHECK: Cost of 18 for VF 2: WIDEN store vp<[[VEC_PTR3]]>, ir<%conv4>
 ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<%0>
@@ -176,15 +176,15 @@ for.inc:                                          ; preds = %for.body, %if.then
 ; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<[[STEPS3]]>
 ; CHECK: Cost of 0 for VF 4: vp<[[VEC_PTR1:%.+]]> = vector-pointer vp<%next.gep>
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%0> = load vp<[[VEC_PTR1]]>
-; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv1> = sext  ir<%0> to i32
+; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32
 ; CHECK: Cost of 0 for VF 4: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.2
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load vp<[[VEC_PTR2]]>
-; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv3> = sext  ir<%1> to i32
+; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
 ; CHECK: Cost of 0 for VF 4: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
 ; CHECK: Cost of 2 for VF 4: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
-; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv4> = trunc  ir<%spec.select.i> to i8
+; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8
 ; CHECK: Cost of 0 for VF 4: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.1
 ; CHECK: Cost of 2 for VF 4: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv4>
 ; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<%0>
@@ -208,15 +208,15 @@ for.inc:                                          ; preds = %for.body, %if.then
 ; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<[[STEPS3]]>
 ; CHECK: Cost of 0 for VF 8: vp<[[VEC_PTR1:%.+]]> = vector-pointer vp<%next.gep>
 ; CHECK: Cost of 2 for VF 8: WIDEN ir<%0> = load vp<[[VEC_PTR1]]>
-; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv1> = sext  ir<%0> to i32
+; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32
 ; CHECK: Cost of 0 for VF 8: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.2
 ; CHECK: Cost of 2 for VF 8: WIDEN ir<%1> = load vp<[[VEC_PTR2]]>
-; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv3> = sext  ir<%1> to i32
+; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32
 ; CHECK: Cost of 4 for VF 8: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
 ; CHECK: Cost of 4 for VF 8: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
 ; CHECK: Cost of 0 for VF 8: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
 ; CHECK: Cost of 4 for VF 8: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
-; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv4> = trunc  ir<%spec.select.i> to i8
+; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8
 ; CHECK: Cost of 0 for VF 8: vp<[[VEC_PTR3:%.+]]> = vector-pointer vp<%next.gep>.1
 ; CHECK: Cost of 2 for VF 8: WIDEN store vp<[[VEC_PTR3]]>, ir<%conv4>
 ; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}
@@ -240,15 +240,15 @@ for.inc:                                          ; preds = %for.body, %if.then
 ; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<[[STEPS3]]>
 ; CHECK: Cost of 0 for VF 16: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep>
 ; CHECK: Cost of 2 for VF 16: WIDEN ir<%0> = load vp<[[VEC_PTR]]>
-; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv1> = sext  ir<%0> to i32
+; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32
 ; CHECK: Cost of 0 for VF 16: vp<[[VEC_PTR1:%.+]]> = vector-pointer vp<%next.gep>.2
 ; CHECK: Cost of 2 for VF 16: WIDEN ir<%1> = load vp<[[VEC_PTR1]]>
-; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv3> = sext  ir<%1> to i32
+; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32
 ; CHECK: Cost of 8 for VF 16: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
 ; CHECK: Cost of 8 for VF 16: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
 ; CHECK: Cost of 0 for VF 16: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
 ; CHECK: Cost of 8 for VF 16: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
-; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv4> = trunc  ir<%spec.select.i> to i8
+; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8
 ; CHECK: Cost of 0 for VF 16: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.1
 ; CHECK: Cost of 2 for VF 16: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv4>
 ; CHECK: Cost of 0 for VF 16: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}>
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
index 976c6a9..551b85b 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
@@ -17,8 +17,8 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'i8_factor_2'
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp2, ptr %tmp0, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp3, ptr %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp2, ptr %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp3, ptr %tmp1, align 1
 ; VF_4-LABEL:  Checking a loop in 'i8_factor_2'
 ; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
@@ -58,8 +58,8 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'i16_factor_2'
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp2, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp3, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp2, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp3, ptr %tmp1, align 2
 ; VF_4-LABEL:  Checking a loop in 'i16_factor_2'
 ; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
@@ -99,8 +99,8 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 12  for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 ; VF_4-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
@@ -140,23 +140,23 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_2:          Found an estimated cost of 22 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 ; VF_4-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_4:          Found an estimated cost of 44 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 ; VF_8-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_8:          Found an estimated cost of 88 for VF 8 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 ; VF_16-LABEL: Checking a loop in 'i64_factor_2'
 ; VF_16:         Found an estimated cost of 176 for VF 16 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.2, ptr %data, i64 %i, i32 0
@@ -181,8 +181,8 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'f16_factor_2'
 ; VF_2:          Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load half, ptr %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load half, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store half %tmp2, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store half %tmp3, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store half %tmp2, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store half %tmp3, ptr %tmp1, align 2
 ; VF_4-LABEL:  Checking a loop in 'f16_factor_2'
 ; VF_4:          Found an estimated cost of 18 for VF 4 For instruction: %tmp2 = load half, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, ptr %tmp1, align 2
@@ -263,23 +263,23 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'f64_factor_2'
 ; VF_2:          Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load double, ptr %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp2, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp3, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp3, ptr %tmp1, align 8
 ; VF_4-LABEL:  Checking a loop in 'f64_factor_2'
 ; VF_4:          Found an estimated cost of 12 for VF 4 For instruction: %tmp2 = load double, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp2, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp3, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp3, ptr %tmp1, align 8
 ; VF_8-LABEL:  Checking a loop in 'f64_factor_2'
 ; VF_8:          Found an estimated cost of 24 for VF 8 For instruction: %tmp2 = load double, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp2, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp3, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp3, ptr %tmp1, align 8
 ; VF_16-LABEL: Checking a loop in 'f64_factor_2'
 ; VF_16:         Found an estimated cost of 48 for VF 16 For instruction: %tmp2 = load double, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp2, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp3, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp3, ptr %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f64.2, ptr %data, i64 %i, i32 0
@@ -309,30 +309,30 @@ entry:
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp3, ptr %tmp0, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp4, ptr %tmp1, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp5, ptr %tmp2, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp3, ptr %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp5, ptr %tmp2, align 1
 ; VF_4-LABEL:  Checking a loop in 'i8_factor_3'
 ; VF_4:          Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp3,  ptr %tmp0, align 1
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp4, ptr %tmp1, align 1
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp5, ptr %tmp2, align 1
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp3,  ptr %tmp0, align 1
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp5, ptr %tmp2, align 1
 ; VF_8-LABEL:  Checking a loop in 'i8_factor_3'
 ; VF_8:          Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp3, ptr %tmp0, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp4, ptr %tmp1, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp5, ptr %tmp2, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp3, ptr %tmp0, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp5, ptr %tmp2, align 1
 ; VF_16-LABEL: Checking a loop in 'i8_factor_3'
 ; VF_16:         Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp3, ptr %tmp0, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp4, ptr %tmp1, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp5, ptr %tmp2, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp3, ptr %tmp0, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp5, ptr %tmp2, align 1
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i8.3, ptr %data, i64 %i, i32 0
@@ -361,30 +361,30 @@ entry:
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp5, ptr %tmp2, align 2
 ; VF_4-LABEL:  Checking a loop in 'i16_factor_3'
 ; VF_4:          Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp5, ptr %tmp2, align 2
 ; VF_8-LABEL:  Checking a loop in 'i16_factor_3'
 ; VF_8:          Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp5, ptr %tmp2, align 2
 ; VF_16-LABEL: Checking a loop in 'i16_factor_3'
 ; VF_16:         Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp5, ptr %tmp2, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.3, ptr %data, i64 %i, i32 0
@@ -413,9 +413,9 @@ entry:
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp5, ptr %tmp2, align 4
 ; VF_4-LABEL:  Checking a loop in 'i32_factor_3'
 ; VF_4:          Found an estimated cost of 8 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
@@ -427,16 +427,16 @@ entry:
 ; VF_8:          Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp5, ptr %tmp2, align 4
 ; VF_16-LABEL: Checking a loop in 'i32_factor_3'
 ; VF_16:         Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp5, ptr %tmp2, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.3, ptr %data, i64 %i, i32 0
@@ -465,30 +465,30 @@ entry:
 ; VF_2:          Found an estimated cost of 22 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp5, ptr %tmp2, align 8
 ; VF_4-LABEL:  Checking a loop in 'i64_factor_3'
 ; VF_4:          Found an estimated cost of 44 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp5, ptr %tmp2, align 8
 ; VF_8-LABEL:  Checking a loop in 'i64_factor_3'
 ; VF_8:          Found an estimated cost of 88 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp5, ptr %tmp2, align 8
 ; VF_16-LABEL: Checking a loop in 'i64_factor_3'
 ; VF_16:         Found an estimated cost of 176 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp5, ptr %tmp2, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.3, ptr %data, i64 %i, i32 0
@@ -517,9 +517,9 @@ entry:
 ; VF_2:          Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load half, ptr %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load half, ptr %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load half, ptr %tmp2, align 2
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store half %tmp3, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store half %tmp4, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store half %tmp5, ptr %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store half %tmp3, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store half %tmp4, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store half %tmp5, ptr %tmp2, align 2
 ; VF_4-LABEL:  Checking a loop in 'f16_factor_3'
 ; VF_4:          Found an estimated cost of 28 for VF 4 For instruction: %tmp3 = load half, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load half, ptr %tmp1, align 2
@@ -621,30 +621,30 @@ entry:
 ; VF_2:          Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load double, ptr %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load double, ptr %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp5, ptr %tmp2, align 8
 ; VF_4-LABEL:  Checking a loop in 'f64_factor_3'
 ; VF_4:          Found an estimated cost of 12 for VF 4 For instruction: %tmp3 = load double, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: %tmp4 = load double, ptr %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp5, ptr %tmp2, align 8
 ; VF_8-LABEL:  Checking a loop in 'f64_factor_3'
 ; VF_8:          Found an estimated cost of 24 for VF 8 For instruction: %tmp3 = load double, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: %tmp4 = load double, ptr %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp5, ptr %tmp2, align 8
 ; VF_16-LABEL: Checking a loop in 'f64_factor_3'
 ; VF_16:         Found an estimated cost of 48 for VF 16 For instruction: %tmp3 = load double, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: %tmp4 = load double, ptr %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp5, ptr %tmp2, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f64.3, ptr %data, i64 %i, i32 0
@@ -677,37 +677,37 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp7, ptr %tmp3, align 1
 ; VF_4-LABEL: Checking a loop in 'i8_factor_4'
 ; VF_4:         Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
 ; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
 ; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
 ; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_4-NEXT:    Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_4-NEXT:    Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_4-NEXT:    Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_4-NEXT:    Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp7, ptr %tmp3, align 1
 ; VF_8-LABEL:  Checking a loop in 'i8_factor_4'
 ; VF_8:          Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp7, ptr %tmp3, align 1
 ; VF_16-LABEL: Checking a loop in 'i8_factor_4'
 ; VF_16:         Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp7, ptr %tmp3, align 1
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i8.4, ptr %data, i64 %i, i32 0
@@ -740,37 +740,37 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp7, ptr %tmp3, align 2
 ; VF_4-LABEL:  Checking a loop in 'i16_factor_4'
 ; VF_4:          Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp7, ptr %tmp3, align 2
 ; VF_8-LABEL:  Checking a loop in 'i16_factor_4'
 ; VF_8:          Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp7, ptr %tmp3, align 2
 ; VF_16-LABEL: Checking a loop in 'i16_factor_4'
 ; VF_16:         Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp7, ptr %tmp3, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.4, ptr %data, i64 %i, i32 0
@@ -803,10 +803,10 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp7, ptr %tmp3, align 4
 ; VF_4-LABEL:  Checking a loop in 'i32_factor_4'
 ; VF_4:          Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
@@ -821,19 +821,19 @@ entry:
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp7, ptr %tmp3, align 4
 ; VF_16-LABEL: Checking a loop in 'i32_factor_4'
 ; VF_16:         Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp7, ptr %tmp3, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.4, ptr %data, i64 %i, i32 0
@@ -866,37 +866,37 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
 ; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp7, ptr %tmp3, align 8
 ; VF_4-LABEL:  Checking a loop in 'i64_factor_4'
 ; VF_4:          Found an estimated cost of 44 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
 ; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp7, ptr %tmp3, align 8
 ; VF_8-LABEL:  Checking a loop in 'i64_factor_4'
 ; VF_8:          Found an estimated cost of 88 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
 ; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp7, ptr %tmp3, align 8
 ; VF_16-LABEL: Checking a loop in 'i64_factor_4'
 ; VF_16:         Found an estimated cost of 176 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
 ; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp7, ptr %tmp3, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.4, ptr %data, i64 %i, i32 0
@@ -1055,37 +1055,37 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load double, ptr %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp6 = load double, ptr %tmp2, align 8
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp7, ptr %tmp3, align 8
 ; VF_4-LABEL:  Checking a loop in 'f64_factor_4'
 ; VF_4:          Found an estimated cost of 12 for VF 4 For instruction: %tmp4 = load double, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: %tmp5 = load double, ptr %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: %tmp6 = load double, ptr %tmp2, align 8
 ; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp7, ptr %tmp3, align 8
 ; VF_8-LABEL:  Checking a loop in 'f64_factor_4'
 ; VF_8:          Found an estimated cost of 24 for VF 8 For instruction: %tmp4 = load double, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: %tmp5 = load double, ptr %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: %tmp6 = load double, ptr %tmp2, align 8
 ; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp7, ptr %tmp3, align 8
 ; VF_16-LABEL: Checking a loop in 'f64_factor_4'
 ; VF_16:         Found an estimated cost of 48 for VF 16 For instruction: %tmp4 = load double, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: %tmp5 = load double, ptr %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: %tmp6 = load double, ptr %tmp2, align 8
 ; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp7, ptr %tmp3, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f64.4, ptr %data, i64 %i, i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
index 79ced9a..2f1af79 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
@@ -108,7 +108,7 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
 ; CHECK:       [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT:    [[IND_END27:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
@@ -117,40 +117,40 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF24:%.*]] = urem i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_VEC25:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF24]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC25]]
+; CHECK-NEXT:    [[TMP56:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC25]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[VEC_EPILOG_RESUME_VAL]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i64> [[DOTSPLAT]], <i64 0, i64 1>
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[TMP57:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
 ; CHECK:       [[VEC_EPILOG_VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX29:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT35:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND30:%.*]] = phi <2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT31:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI32:%.*]] = phi <2 x i64> [ [[TMP55]], %[[VEC_EPILOG_PH]] ], [ [[TMP56:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP57:%.*]] = add i64 [[INDEX29]], 0
-; CHECK-NEXT:    [[NEXT_GEP33:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP57]]
-; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr i8, ptr [[NEXT_GEP33]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD34:%.*]] = load <2 x i8>, ptr [[TMP58]], align 1
-; CHECK-NEXT:    [[TMP59:%.*]] = zext <2 x i8> [[WIDE_LOAD34]] to <2 x i64>
-; CHECK-NEXT:    [[TMP60:%.*]] = shl <2 x i64> [[VEC_IND30]], splat (i64 1)
-; CHECK-NEXT:    [[TMP61:%.*]] = shl <2 x i64> [[TMP59]], [[TMP60]]
-; CHECK-NEXT:    [[TMP56]] = or <2 x i64> [[TMP61]], [[VEC_PHI32]]
-; CHECK-NEXT:    [[INDEX_NEXT35]] = add nuw i64 [[INDEX29]], 2
-; CHECK-NEXT:    [[VEC_IND_NEXT31]] = add <2 x i64> [[VEC_IND30]], splat (i64 2)
-; CHECK-NEXT:    [[TMP62:%.*]] = icmp eq i64 [[INDEX_NEXT35]], [[N_VEC25]]
-; CHECK-NEXT:    br i1 [[TMP62]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    [[INDEX26:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT32:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND27:%.*]] = phi <2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT28:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI29:%.*]] = phi <2 x i64> [ [[TMP57]], %[[VEC_EPILOG_PH]] ], [ [[TMP58:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP59:%.*]] = add i64 [[INDEX26]], 0
+; CHECK-NEXT:    [[NEXT_GEP30:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP59]]
+; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr i8, ptr [[NEXT_GEP30]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD32:%.*]] = load <2 x i8>, ptr [[TMP60]], align 1
+; CHECK-NEXT:    [[TMP61:%.*]] = zext <2 x i8> [[WIDE_LOAD32]] to <2 x i64>
+; CHECK-NEXT:    [[TMP62:%.*]] = shl <2 x i64> [[VEC_IND27]], splat (i64 1)
+; CHECK-NEXT:    [[TMP63:%.*]] = shl <2 x i64> [[TMP61]], [[TMP62]]
+; CHECK-NEXT:    [[TMP58]] = or <2 x i64> [[TMP63]], [[VEC_PHI29]]
+; CHECK-NEXT:    [[INDEX_NEXT32]] = add nuw i64 [[INDEX26]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT28]] = add <2 x i64> [[VEC_IND27]], splat (i64 2)
+; CHECK-NEXT:    [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT32]], [[N_VEC25]]
+; CHECK-NEXT:    br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[VEC_EPILOG_MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP54:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP56]])
-; CHECK-NEXT:    [[CMP_N36:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC25]]
-; CHECK-NEXT:    br i1 [[CMP_N36]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    [[TMP55:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP58]])
+; CHECK-NEXT:    [[CMP_N33:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC25]]
+; CHECK-NEXT:    br i1 [[CMP_N33]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL35:%.*]] = phi i64 [ [[N_VEC25]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL36:%.*]] = phi ptr [ [[IND_END]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], %[[ITER_CHECK]] ], [ [[IND_END27]], %[[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX37:%.*]] = phi i64 [ [[TMP54]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL34:%.*]] = phi i64 [ [[N_VEC25]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX35:%.*]] = phi i64 [ [[TMP55]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL36:%.*]] = phi ptr [ [[TMP56]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], %[[ITER_CHECK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL35]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX37]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL34]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX35]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL36]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[TMP53:%.*]] = load i8, ptr [[PTR_IV]], align 1
 ; CHECK-NEXT:    [[CONV3:%.*]] = zext i8 [[TMP53]] to i64
@@ -164,7 +164,7 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[AND:%.*]] = select i1 [[CMP_I166_I]], i1 [[CMP2]], i1 false
 ; CHECK-NEXT:    br i1 [[AND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP52]], %[[MIDDLE_BLOCK]] ], [ [[TMP54]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP52]], %[[MIDDLE_BLOCK]] ], [ [[TMP55]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[RES:%.*]] = icmp eq i64 [[RED_NEXT_LCSSA]], 0
 ; CHECK-NEXT:    ret i1 [[RES]]
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
index 130ef75..3d00c22 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
@@ -75,17 +75,17 @@ define i32 @any_of_reduction_used_in_blend_with_mutliple_phis(ptr %src, i64 %N,
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[C_0]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i1> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <vscale x 2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[C_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i1> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <vscale x 2 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
+; CHECK-NEXT:    [[TMP8:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> [[TMP7]], <vscale x 2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[SRC]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PREDPHI:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = xor <vscale x 2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP7:%.*]] = xor <vscale x 2 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
-; CHECK-NEXT:    [[TMP8:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> [[TMP7]], <vscale x 2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x ptr> @llvm.masked.gather.nxv2p0.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT4]], i32 8, <vscale x 2 x i1> [[TMP8]], <vscale x 2 x ptr> poison)
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_MASKED_GATHER]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = or <vscale x 2 x i1> [[VEC_PHI]], [[TMP9]]
@@ -100,8 +100,8 @@ define i32 @any_of_reduction_used_in_blend_with_mutliple_phis(ptr %src, i64 %N,
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; CHECK:       [[LOOP_HEADER]]:
 ; CHECK-NEXT:    [[ANY_OF_RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ANY_OF_RED_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
index 11efac95..f2318d6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
@@ -22,9 +22,9 @@ define void @block_with_dead_inst_1(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP8]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul <vscale x 8 x i64> [[TMP11]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP13]]
@@ -116,9 +116,9 @@ define void @block_with_dead_inst_2(ptr %src) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 333, [[TMP5]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
@@ -210,9 +210,9 @@ define void @multiple_blocks_with_dead_insts_3(ptr %src) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 333, [[TMP5]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
@@ -314,9 +314,9 @@ define void @multiple_blocks_with_dead_insts_4(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP8]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul <vscale x 8 x i64> [[TMP11]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP13]]
@@ -420,9 +420,9 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_5(ptr %src) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 333, [[TMP5]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
@@ -534,9 +534,9 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP8]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul <vscale x 8 x i64> [[TMP11]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP13]]
@@ -851,9 +851,9 @@ define void @dead_load_in_block(ptr %dst, ptr %src, i8 %N, i64 %x) #0 {
 ; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP15]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP18:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[TMP20:%.*]] = mul <vscale x 2 x i64> [[TMP18]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP20]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index 16c23cd..450405f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -29,10 +29,10 @@ define void @dead_load(ptr %p, i16 %start) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP5]], [[TMP11]]
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[N_VEC]], 3
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[START_EXT]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 8
+; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[N_VEC]], 3
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[START_EXT]], [[TMP18]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[START_EXT]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP15:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
@@ -111,9 +111,9 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 252, [[TMP6]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[N_VEC]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[N_VEC]], 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul <vscale x 4 x i32> [[TMP9]], splat (i32 4)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP11]]
@@ -334,9 +334,9 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 37, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 37, [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 9
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 9
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[A]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor <vscale x 2 x i32> [[BROADCAST_SPLAT]], splat (i32 -1)
@@ -419,9 +419,9 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul <vscale x 4 x i64> [[TMP9]], splat (i64 2)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP11]]
@@ -433,24 +433,24 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP14]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[TMP17:%.*]] = zext <vscale x 4 x i8> [[TMP16]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP17]], <vscale x 4 x ptr> [[TMP18]], i32 4, <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP18:%.*]] = zext <vscale x 4 x i8> [[TMP17]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP18]], <vscale x 4 x ptr> [[TMP19]], i32 4, <vscale x 4 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
index a7765f4..038e726 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -432,6 +432,7 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <vscale x 2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -439,7 +440,6 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
 ; CHECK-NEXT:    [[TMP11:%.*]] = udiv <vscale x 2 x i64> [[WIDE_LOAD]], [[TMP10]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
 ; CHECK-NEXT:    store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP9]], align 8
@@ -477,6 +477,7 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0
 ; FIXED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; FIXED-NEXT:    [[TMP0:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; FIXED-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
 ; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -486,10 +487,8 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
 ; FIXED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
 ; FIXED-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
-; FIXED-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
-; FIXED-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT:    [[TMP7:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], [[TMP5]]
-; FIXED-NEXT:    [[TMP8:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[TMP6]]
+; FIXED-NEXT:    [[TMP8:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[TMP5]]
 ; FIXED-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD]]
 ; FIXED-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]]
 ; FIXED-NEXT:    store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 8
@@ -560,6 +559,7 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <vscale x 2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -567,7 +567,6 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
 ; CHECK-NEXT:    [[TMP11:%.*]] = sdiv <vscale x 2 x i64> [[WIDE_LOAD]], [[TMP10]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
 ; CHECK-NEXT:    store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP9]], align 8
@@ -605,6 +604,7 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0
 ; FIXED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; FIXED-NEXT:    [[TMP0:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; FIXED-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
 ; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -614,10 +614,8 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
 ; FIXED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
 ; FIXED-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
-; FIXED-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
-; FIXED-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT:    [[TMP7:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], [[TMP5]]
-; FIXED-NEXT:    [[TMP8:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[TMP6]]
+; FIXED-NEXT:    [[TMP8:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[TMP5]]
 ; FIXED-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD]]
 ; FIXED-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]]
 ; FIXED-NEXT:    store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
index dd2e75f..58d6fd0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
@@ -28,8 +28,8 @@ define i64 @pr97452_scalable_vf1_for(ptr %src) #0 {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[WIDE_LOAD1]], i32 3
 ; CHECK-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
index 8131c7b..e4425a9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
@@ -63,13 +63,13 @@ define void @skip_free_iv_truncate(i16 %x, ptr %A) #0 {
 ; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], i64 [[TMP46]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP5]], [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP52:%.*]] = mul i64 [[TMP51]], 8
 ; CHECK-NEXT:    [[TMP49:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[X_I64]], [[TMP49]]
 ; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
 ; CHECK-NEXT:    [[TMP50:%.*]] = mul i32 [[DOTCAST]], 3
 ; CHECK-NEXT:    [[IND_END22:%.*]] = add i32 [[X_I32]], [[TMP50]]
-; CHECK-NEXT:    [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP52:%.*]] = mul i64 [[TMP51]], 8
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[X_I64]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP53:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
@@ -92,11 +92,11 @@ define void @skip_free_iv_truncate(i16 %x, ptr %A) #0 {
 ; CHECK-NEXT:    br label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[X_I64]], %[[VECTOR_MEMCHECK]] ], [ [[X_I64]], %[[ENTRY]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL14:%.*]] = phi i32 [ [[IND_END22]], %[[MIDDLE_BLOCK]] ], [ [[X_I32]], %[[VECTOR_MEMCHECK]] ], [ [[X_I32]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL13:%.*]] = phi i32 [ [[IND_END22]], %[[MIDDLE_BLOCK]] ], [ [[X_I32]], %[[VECTOR_MEMCHECK]] ], [ [[X_I32]], %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[IV_CONV:%.*]] = phi i32 [ [[BC_RESUME_VAL14]], %[[SCALAR_PH]] ], [ [[TMP64:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_CONV:%.*]] = phi i32 [ [[BC_RESUME_VAL13]], %[[SCALAR_PH]] ], [ [[TMP64:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[GEP_I64:%.*]] = getelementptr i64, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT:    [[TMP61:%.*]] = load i64, ptr [[GEP_I64]], align 8
 ; CHECK-NEXT:    [[TMP62:%.*]] = sext i32 [[IV_CONV]] to i64
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index 5c5600b..10ac870 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -49,28 +49,26 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 3, [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 3, [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP7]], i64 3)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 3)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP9]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i8> poison)
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 2 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP12]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i8> poison)
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 2 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -78,16 +76,16 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP14]], 1
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP15]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]]
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 3
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -118,28 +116,26 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 5, [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 5, [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 5)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 5)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -147,16 +143,16 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP14]], 1
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP15]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]]
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -187,28 +183,26 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 8, [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 8, [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 8)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 8)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -216,16 +210,16 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP14]], 1
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP15]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]]
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 8
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -256,19 +250,17 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shl <16 x i8> [[WIDE_LOAD]], splat (i8 1)
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    store <16 x i8> [[TMP6]], ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <16 x i8> [[WIDE_LOAD]], splat (i8 1)
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add <16 x i8> [[TMP2]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    store <16 x i8> [[TMP5]], ptr [[TMP6]], align 1
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -285,7 +277,7 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -317,19 +309,17 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shl <32 x i8> [[WIDE_LOAD]], splat (i8 1)
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = add <32 x i8> [[TMP3]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    store <32 x i8> [[TMP6]], ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <32 x i8> [[WIDE_LOAD]], splat (i8 1)
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add <32 x i8> [[TMP2]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    store <32 x i8> [[TMP5]], ptr [[TMP6]], align 1
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -346,7 +336,7 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 32
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -390,7 +380,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -407,7 +397,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 24
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
index 2c19aab..dc63072 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
@@ -38,9 +38,9 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV32-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; RV32-NEXT:    [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]]
 ; RV32-NEXT:    [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]]
-; RV32-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
 ; RV32-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; RV32-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; RV32-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
 ; RV32-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; RV32-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP7]], splat (i64 16)
 ; RV32-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP9]]
@@ -117,9 +117,9 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV64-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; RV64-NEXT:    [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]]
 ; RV64-NEXT:    [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]]
-; RV64-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
 ; RV64-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; RV64-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; RV64-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
 ; RV64-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; RV64-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP7]], splat (i64 16)
 ; RV64-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP9]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll b/llvm/test/Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll
new file mode 100644
index 0000000..93bd44f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll
@@ -0,0 +1,39 @@
+; RUN: opt -passes=debugify,loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -S < %s 2>&1 | FileCheck --check-prefix=DEBUGLOC %s
+
+; Testing the debug locations of the generated vector intrinsic is same as
+; its scalar counterpart.
+
+define void @vp_select(ptr %a, ptr %b, ptr %c, i64 %N) {
+; DEBUGLOC-LABEL: define void @vp_select(
+; DEBUGLOC: vector.body:
+; DEBUGLOC:   = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> %{{.+}}, <vscale x 4 x i32> %{{.+}}, <vscale x 4 x i32> %{{.+}}, i32 %{{.+}}), !dbg ![[SELLOC:[0-9]+]]
+; DEBUGLOC: loop:
+; DEBUGLOC:   = select i1 %{{.+}}, i32 %{{.+}}, i32 %{{.+}}, !dbg ![[SELLOC]]
+;
+ entry:
+   br label %loop
+
+loop:
+   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+   %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
+   %load.b = load i32, ptr %gep.b, align 4
+   %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
+   %load.c = load i32, ptr %gep.c, align 4
+   %cmp = icmp sgt i32 %load.b, %load.c
+   %neg.c = sub i32 0, %load.c
+   %sel = select i1 %cmp, i32 %load.c, i32 %neg.c
+   %add = add i32 %sel, %load.b
+   %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv
+   store i32 %add, ptr %gep.a, align 4
+   %iv.next = add nuw nsw i64 %iv, 1
+   %exitcond = icmp eq i64 %iv.next, %N
+   br i1 %exitcond, label %exit, label %loop
+
+ exit:
+   ret void
+ }
+
+ ; DEBUGLOC: [[SELLOC]] = !DILocation(line: 9
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index eb60c24..951d833 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -64,6 +64,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + vp<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
@@ -92,11 +94,13 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%0>
+; CHECK-NEXT:    EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%n>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
+; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
+; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; CHECK:         IR   %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -181,11 +185,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:    IR   %16 = mul i64 %15, 4
 ; CHECK-NEXT:    IR   %n.mod.vf = urem i64 %0, %16
 ; CHECK-NEXT:    IR   %n.vec = sub i64 %0, %n.mod.vf
-; CHECK-NEXT:    IR   %ind.end = sub i64 %0, %n.vec
-; CHECK-NEXT:    IR   %.cast = trunc i64 %n.vec to i32
-; CHECK-NEXT:    IR   %ind.end3 = sub i32 %n, %.cast
 ; CHECK-NEXT:    IR   %17 = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    IR   %18 = mul i64 %17, 4
+; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
@@ -217,8 +220,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<scalar.ph>:
-; CHECK-NEXT:    EMIT vp<[[RESUME_1:%.+]]> = resume-phi ir<%ind.end>, ir<%0>
-; CHECK-NEXT:    EMIT vp<[[RESUME_2:%.+]]>.1 = resume-phi ir<%ind.end3>, ir<%n>
+; CHECK-NEXT:    EMIT vp<[[RESUME_1:%.+]]> = resume-phi vp<[[END1]]>, ir<%0>
+; CHECK-NEXT:    EMIT vp<[[RESUME_2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%n>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
@@ -311,6 +314,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + vp<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
@@ -339,11 +344,13 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%0>
+; CHECK-NEXT:    EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%n>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
+; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
+; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; CHECK:         IR   %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -428,11 +435,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:    IR   %16 = mul i64 %15, 4
 ; CHECK-NEXT:    IR   %n.mod.vf = urem i64 %0, %16
 ; CHECK-NEXT:    IR   %n.vec = sub i64 %0, %n.mod.vf
-; CHECK-NEXT:    IR   %ind.end = sub i64 %0, %n.vec
-; CHECK-NEXT:    IR   %.cast = trunc i64 %n.vec to i32
-; CHECK-NEXT:    IR   %ind.end3 = sub i32 %n, %.cast
 ; CHECK-NEXT:    IR   %17 = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    IR   %18 = mul i64 %17, 4
+; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
@@ -464,8 +470,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<scalar.ph>:
-; CHECK-NEXT:    EMIT vp<[[RESUME1:%.+]]> = resume-phi ir<%ind.end>, ir<%0>
-; CHECK-NEXT:    EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi ir<%ind.end3>, ir<%n>
+; CHECK-NEXT:    EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%0>
+; CHECK-NEXT:    EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%n>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
index 375278e..3386a7d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
@@ -14,16 +14,14 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 [[TMP3]], i32 4)
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32.p0(ptr [[TMP5]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i32> poison)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4:%.*]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32.p0(ptr [[TMP7]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i32> poison)
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <vscale x 1 x i32> [[WIDE_MASKED_LOAD]], splat (i32 1)
-; CHECK-NEXT:    call void @llvm.masked.store.nxv1i32.p0(<vscale x 1 x i32> [[TMP6]], ptr [[TMP5]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv1i32.p0(<vscale x 1 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -31,13 +29,13 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 [[IV]]
 ; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[V]], 1
 ; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[IV]], 3
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -73,16 +71,14 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[TMP3]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP5]], i32 4)
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6:%.*]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 1)
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -90,13 +86,13 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 [[IV]]
 ; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[V]], 1
 ; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[IV]], 3
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 8395ffd..30cb33e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -86,9 +86,9 @@ define void @single_constant_stride_int_iv(ptr %p) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 64
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 64
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP6]], splat (i64 64)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
@@ -162,10 +162,10 @@ define void @single_constant_stride_ptr_iv(ptr %p) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[N_VEC]], 8
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[N_VEC]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP18]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
@@ -328,7 +328,6 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) {
 ; NOSTRIDED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; NOSTRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; NOSTRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; NOSTRIDED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
 ; NOSTRIDED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; NOSTRIDED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; NOSTRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -348,7 +347,7 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) {
 ; NOSTRIDED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; NOSTRIDED:       scalar.ph:
 ; NOSTRIDED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
-; NOSTRIDED-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
+; NOSTRIDED-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; NOSTRIDED-NEXT:    br label [[LOOP:%.*]]
 ; NOSTRIDED:       loop:
 ; NOSTRIDED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -606,7 +605,6 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) {
 ; NOSTRIDED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; NOSTRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; NOSTRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; NOSTRIDED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
 ; NOSTRIDED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; NOSTRIDED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; NOSTRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -626,7 +624,7 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) {
 ; NOSTRIDED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; NOSTRIDED:       scalar.ph:
 ; NOSTRIDED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
-; NOSTRIDED-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
+; NOSTRIDED-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; NOSTRIDED-NEXT:    br label [[LOOP:%.*]]
 ; NOSTRIDED:       loop:
 ; NOSTRIDED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -731,12 +729,12 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; STRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP9]]
 ; STRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; STRIDED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; STRIDED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
 ; STRIDED-NEXT:    [[TMP10:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
 ; STRIDED-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP10]]
 ; STRIDED-NEXT:    [[TMP11:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
 ; STRIDED-NEXT:    [[IND_END7:%.*]] = getelementptr i8, ptr [[P2]], i64 [[TMP11]]
-; STRIDED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
 ; STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; STRIDED:       vector.body:
 ; STRIDED-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
index bfdcfbf..f38aa11 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
@@ -163,20 +163,14 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i8> [[BROADCAST_SPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP0:%.*]] = trunc <2 x i8> [[BROADCAST_SPLAT]] to <2 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i1> splat (i1 true), [[TMP0]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT3]], <2 x ptr> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[VEC_IV]], i32 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 [[TMP1]], i32 2)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 0, i32 2)
 ; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT1]], <2 x ptr> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> zeroinitializer, <2 x ptr> [[BROADCAST_SPLAT4]], i32 1, <2 x i1> [[TMP3]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -194,7 +188,7 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 {
 ; CHECK-NEXT:    [[ADD]] = add i8 [[F_039]], 1
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[F_039]] to i32
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CONV]], 1
-; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -293,7 +287,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[WIDE_MASKED_GATHER]], <vscale x 2 x ptr> [[BROADCAST_SPLAT6]], i32 8, <vscale x 2 x i1> [[TMP8]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -314,7 +308,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64
 ; CHECK:       [[LOOP_LATCH]]:
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[V]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -354,8 +348,7 @@ attributes #1 = { "target-features"="+64bit,+v" }
 ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
 ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll
index 11cf832..f07aaec 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll
@@ -989,6 +989,62 @@ exit:
   ret void
 }
 
+; There's no @llvm.vp.log10, so don't transform it.
+define void @log10(ptr %a, ptr %b, i64 %N) {
+; IF-EVL-LABEL: define void @log10(
+; IF-EVL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT:  [[ENTRY:.*]]:
+; IF-EVL-NEXT:    br label %[[LOOP:.*]]
+; IF-EVL:       [[LOOP]]:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; IF-EVL-NEXT:    [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
+; IF-EVL-NEXT:    [[TMP0:%.*]] = load float, ptr [[GEP]], align 4
+; IF-EVL-NEXT:    [[COND:%.*]] = tail call float @llvm.log10.f32(float [[TMP0]])
+; IF-EVL-NEXT:    [[GEP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT:    store float [[COND]], ptr [[GEP9]], align 4
+; IF-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; IF-EVL:       [[EXIT]]:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: define void @log10(
+; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT:  [[ENTRY:.*]]:
+; NO-VP-NEXT:    br label %[[LOOP:.*]]
+; NO-VP:       [[LOOP]]:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; NO-VP-NEXT:    [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT:    [[TMP0:%.*]] = load float, ptr [[GEP]], align 4
+; NO-VP-NEXT:    [[COND:%.*]] = tail call float @llvm.log10.f32(float [[TMP0]])
+; NO-VP-NEXT:    [[GEP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT:    store float [[COND]], ptr [[GEP9]], align 4
+; NO-VP-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP:       [[EXIT]]:
+; NO-VP-NEXT:    ret void
+;
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds float, ptr %b, i64 %iv
+  %0 = load float, ptr %gep, align 4
+  %cond = tail call float @llvm.log10.f32(float %0)
+  %gep9 = getelementptr inbounds float, ptr %a, i64 %iv
+  store float %cond, ptr %gep9, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+
 declare i32 @llvm.smax.i32(i32, i32)
 declare i32 @llvm.smin.i32(i32, i32)
 declare i32 @llvm.umax.i32(i32, i32)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 209c251..f323231 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -20,10 +20,10 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
-; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
+; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -37,16 +37,16 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 0, [[TMP18]]
 ; IF-EVL-NEXT:    [[TMP10:%.*]] = sub i64 1, [[TMP18]]
-; IF-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP9]]
-; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[TMP10]]
+; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = mul i64 0, [[TMP19]]
 ; IF-EVL-NEXT:    [[TMP15:%.*]] = sub i64 1, [[TMP19]]
-; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP14]]
-; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP15]]
+; IF-EVL-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
+; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
 ; IF-EVL-NEXT:    [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
@@ -121,10 +121,10 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
-; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
+; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index 7efa65e..a2f85b9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -25,9 +25,9 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP7]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index e7eb577..cd1d734 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -63,11 +63,12 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; IF-EVL-OUTLOOP-EMPTY:
 ; IF-EVL-OUTLOOP-NEXT: scalar.ph:
+; IF-EVL-OUTLOOP-NEXT:   EMIT vp<[[IV_RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; IF-EVL-OUTLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb<for.body>
 ; IF-EVL-OUTLOOP-EMPTY:
 ; IF-EVL-OUTLOOP-NEXT: ir-bb<for.body>:
-; IF-EVL-OUTLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; IF-EVL-OUTLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[IV_RESUME]]> from scalar.ph)
 ; IF-EVL-OUTLOOP-NEXT:   IR   %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
 ; IF-EVL-OUTLOOP:        IR   %exitcond.not = icmp eq i64 %iv.next, %n
 ; IF-EVL-OUTLOOP-NEXT: No successors
@@ -113,11 +114,12 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: scalar.ph:
+; IF-EVL-INLOOP-NEXT:   EMIT vp<[[IV_RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; IF-EVL-INLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.body>
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: ir-bb<for.body>:
-; IF-EVL-INLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; IF-EVL-INLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[IV_RESUME]]> from scalar.ph)
 ; IF-EVL-INLOOP-NEXT:   IR   %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
 ; IF-EVL-INLOOP:        IR   %exitcond.not = icmp eq i64 %iv.next, %n
 ; IF-EVL-INLOOP-NEXT: No successors
@@ -159,11 +161,12 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; NO-VP-OUTLOOP-EMPTY:
 ; NO-VP-OUTLOOP-NEXT: scalar.ph:
+; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[IV_RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb<for.body>
 ; NO-VP-OUTLOOP-EMPTY:
 ; NO-VP-OUTLOOP-NEXT: ir-bb<for.body>:
-; NO-VP-OUTLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; NO-VP-OUTLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[IV_RESUME]]> from scalar.ph)
 ; NO-VP-OUTLOOP-NEXT:   IR   %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
 ; NO-VP-OUTLOOP:        IR   %exitcond.not = icmp eq i64 %iv.next, %n
 ; NO-VP-OUTLOOP-NEXT: No successors
@@ -205,11 +208,12 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; NO-VP-INLOOP-EMPTY:
 ; NO-VP-INLOOP-NEXT: scalar.ph:
+; NO-VP-INLOOP-NEXT:   EMIT vp<[[IV_RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; NO-VP-INLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; NO-VP-INLOOP-NEXT: Successor(s): ir-bb<for.body>
 ; NO-VP-INLOOP-EMPTY:
 ; NO-VP-INLOOP-NEXT: ir-bb<for.body>:
-; NO-VP-INLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; NO-VP-INLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[IV_RESUME]]> from scalar.ph)
 ; NO-VP-INLOOP-NEXT:   IR   %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
 ; NO-VP-INLOOP:        IR   %exitcond.not = icmp eq i64 %iv.next, %n
 ; NO-VP-INLOOP-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
index 98245fc..2de0f7e 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
@@ -19,125 +19,103 @@ define void @test(ptr %p, i40 %a) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt <16 x i1> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE32:%.*]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i32> poison, i32 [[INDEX]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT2]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <16 x i32> [[BROADCAST_SPLAT3]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp ule <16 x i32> [[VEC_IV]], splat (i32 9)
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP0]], i32 0
-; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
 ; CHECK-NEXT:    store i1 [[TMP10]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1
-; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
+; CHECK:       pred.store.if1:
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP8]], i32 1
+; CHECK-NEXT:    store i1 [[TMP9]], ptr [[P]], align 1
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.continue2:
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
 ; CHECK:       pred.store.if3:
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x i1> [[TMP8]], i32 2
 ; CHECK-NEXT:    store i1 [[TMP12]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
 ; CHECK:       pred.store.continue4:
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2
-; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
 ; CHECK:       pred.store.if5:
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i1> [[TMP8]], i32 3
 ; CHECK-NEXT:    store i1 [[TMP14]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
 ; CHECK:       pred.store.continue6:
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3
-; CHECK-NEXT:    br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
 ; CHECK:       pred.store.if7:
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x i1> [[TMP8]], i32 4
 ; CHECK-NEXT:    store i1 [[TMP16]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
 ; CHECK:       pred.store.continue8:
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4
-; CHECK-NEXT:    br i1 [[TMP17]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
 ; CHECK:       pred.store.if9:
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <16 x i1> [[TMP8]], i32 5
 ; CHECK-NEXT:    store i1 [[TMP18]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
 ; CHECK:       pred.store.continue10:
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5
-; CHECK-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
 ; CHECK:       pred.store.if11:
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <16 x i1> [[TMP8]], i32 6
 ; CHECK-NEXT:    store i1 [[TMP20]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
 ; CHECK:       pred.store.continue12:
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6
-; CHECK-NEXT:    br i1 [[TMP21]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
 ; CHECK:       pred.store.if13:
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <16 x i1> [[TMP8]], i32 7
 ; CHECK-NEXT:    store i1 [[TMP22]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE14]]
 ; CHECK:       pred.store.continue14:
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7
-; CHECK-NEXT:    br i1 [[TMP23]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
 ; CHECK:       pred.store.if15:
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <16 x i1> [[TMP8]], i32 8
 ; CHECK-NEXT:    store i1 [[TMP24]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE16]]
 ; CHECK:       pred.store.continue16:
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8
-; CHECK-NEXT:    br i1 [[TMP25]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
 ; CHECK:       pred.store.if17:
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <16 x i1> [[TMP8]], i32 9
 ; CHECK-NEXT:    store i1 [[TMP26]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE18]]
 ; CHECK:       pred.store.continue18:
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9
-; CHECK-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
 ; CHECK:       pred.store.if19:
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <16 x i1> [[TMP8]], i32 10
 ; CHECK-NEXT:    store i1 [[TMP28]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE20]]
 ; CHECK:       pred.store.continue20:
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10
-; CHECK-NEXT:    br i1 [[TMP29]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
 ; CHECK:       pred.store.if21:
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <16 x i1> [[TMP8]], i32 11
 ; CHECK-NEXT:    store i1 [[TMP30]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE22]]
 ; CHECK:       pred.store.continue22:
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11
-; CHECK-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
 ; CHECK:       pred.store.if23:
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP8]], i32 12
 ; CHECK-NEXT:    store i1 [[TMP32]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE24]]
 ; CHECK:       pred.store.continue24:
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12
-; CHECK-NEXT:    br i1 [[TMP33]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
 ; CHECK:       pred.store.if25:
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <16 x i1> [[TMP8]], i32 13
 ; CHECK-NEXT:    store i1 [[TMP34]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE26]]
 ; CHECK:       pred.store.continue26:
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13
-; CHECK-NEXT:    br i1 [[TMP35]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
 ; CHECK:       pred.store.if27:
-; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <16 x i1> [[TMP8]], i32 14
 ; CHECK-NEXT:    store i1 [[TMP36]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE28]]
 ; CHECK:       pred.store.continue28:
-; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14
-; CHECK-NEXT:    br i1 [[TMP37]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
 ; CHECK:       pred.store.if29:
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <16 x i1> [[TMP8]], i32 15
 ; CHECK-NEXT:    store i1 [[TMP38]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE30]]
 ; CHECK:       pred.store.continue30:
-; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15
-; CHECK-NEXT:    br i1 [[TMP39]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32]]
-; CHECK:       pred.store.if31:
-; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
-; CHECK-NEXT:    store i1 [[TMP40]], ptr [[P]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE32]]
-; CHECK:       pred.store.continue32:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -156,7 +134,7 @@ define void @test(ptr %p, i40 %a) {
 ; CHECK-NEXT:    store i1 [[ICMP_SGT]], ptr [[P]], align 1
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp ult i32 [[IV_NEXT]], 10
-; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -183,7 +161,6 @@ exit:                                             ; preds = %for.body
 }
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
index d0754f1..7b0fa64 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
@@ -66,8 +66,8 @@ define void @func_21() {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 6, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 6, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LV:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
index fcf1ba0..61bcbaa 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
@@ -16,30 +16,24 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw i64 [[TMP1]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw i64 [[TMP2]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw i64 [[TMP3]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw i64 0, 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw i64 1, 4
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nsw i64 2, 4
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw i64 3, 4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP8]], align 1
-; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP9]], align 1
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[TMP10]], align 1
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[TMP11]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP7]], align 1
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP13]], i32 1
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[TMP14]], i32 2
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> [[TMP18]], i8 [[TMP15]], i32 3
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i8> [[TMP13]], i8 [[TMP10]], i32 2
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> [[TMP14]], i8 [[TMP11]], i32 3
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 4
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP23]], align 4
 ; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0
@@ -64,14 +58,13 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
 ; CHECK:       [[PRED_STORE_CONTINUE4]]:
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3
-; CHECK-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]]
+; CHECK-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
 ; CHECK:       [[PRED_STORE_IF5]]:
 ; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    store i32 [[TMP31]], ptr [[DST]], align 4
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
 ; CHECK:       [[PRED_STORE_CONTINUE6]]:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -93,7 +86,7 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK:       [[LOOP_LATCH]]:
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 4
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -125,7 +118,6 @@ exit:
 }
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
index 61cae9c..83e2f848 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -17,15 +17,11 @@ define void @f1() {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr ptr, ptr [[TMP2]], i32 0
-; CHECK-NEXT:    store <2 x ptr> <ptr @a, ptr @a>, ptr [[TMP3]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i16 0 to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr ptr, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <2 x ptr> <ptr @a, ptr @a>, ptr [[TMP2]], align 8
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[BB3:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -40,7 +36,7 @@ define void @f1() {
 ; CHECK-NEXT:    store ptr [[_TMP2]], ptr [[_TMP7]], align 8
 ; CHECK-NEXT:    [[_TMP9]] = add nsw i16 [[C_1_0]], 1
 ; CHECK-NEXT:    [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2
-; CHECK-NEXT:    br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
index f319036..15bdbea 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -42,8 +42,8 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]]
 ; CHECK-NEXT:    [[IND_END4:%.*]] = add i64 3, [[N_VEC3]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 6a12be7..5c0aeb5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -495,8 +495,8 @@ define i1 @any_of_cost(ptr %start, ptr %end) #0 {
 ; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP30]], i1 false, i1 false
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ false, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ false, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[ANY_OF:%.*]] = phi i1 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANY_OF_NEXT:%.*]], [[LOOP]] ]
@@ -986,8 +986,8 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 {
 ; CHECK-NEXT:    store i32 [[TMP10]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll
new file mode 100644
index 0000000..3d23090
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i1 @fn(ptr %nno) #0 {
+; CHECK-LABEL: define i1 @fn(
+; CHECK-SAME: ptr [[NNO:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 10, i64 9, i64 8, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 10, [[INDEX]]
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], splat (i64 10)
+; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], splat (i64 1)
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw i32, ptr [[NNO]], i64 [[TMP22]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP23]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 -3
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP6]], i32 4, <4 x i1> [[REVERSE]], <4 x i32> poison)
+; CHECK-NEXT:    [[REVERSE1:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP7:%.*]] = shl <4 x i32> [[REVERSE1]], splat (i32 1)
+; CHECK-NEXT:    [[TMP8:%.*]] = urem <4 x i32> [[TMP7]], splat (i32 10)
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[REVERSE1]], <4 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP11]] = or <4 x i32> [[PREDPHI]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP11]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP12]])
+; CHECK-NEXT:    br i1 true, label [[FOR_END36:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ -2, [[MIDDLE_BLOCK]] ], [ 10, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[FOR_BODY20:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC35:%.*]] ]
+; CHECK-NEXT:    [[SUM_01:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_1:%.*]], [[FOR_INC35]] ]
+; CHECK-NEXT:    [[REM4:%.*]] = and i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[CMP21:%.*]] = icmp eq i64 [[REM4]], 0
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds nuw i32, ptr [[NNO]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT:    br i1 [[CMP21]], label [[IF_THEN22:%.*]], label [[FOR_INC35]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[MUL:%.*]] = shl i32 [[TMP15]], 1
+; CHECK-NEXT:    [[REM27:%.*]] = urem i32 [[MUL]], 10
+; CHECK-NEXT:    br label [[FOR_INC35]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[REM27_PN:%.*]] = phi i32 [ [[REM27]], [[IF_THEN22]] ], [ [[TMP15]], [[FOR_BODY20]] ]
+; CHECK-NEXT:    [[SUM_1]] = or i32 [[REM27_PN]], [[SUM_01]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[CMP19_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
+; CHECK-NEXT:    br i1 [[CMP19_NOT]], label [[FOR_END36]], label [[FOR_BODY20]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_1]], [[FOR_INC35]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[CMP41:%.*]] = icmp eq i32 [[SUM_1_LCSSA]], 0
+; CHECK-NEXT:    ret i1 [[CMP41]]
+;
+entry:
+  br label %loop.header
+
+loop.header:                                       ; preds = %entry, %loop.latch
+  %iv = phi i64 [ 10, %entry ], [ %iv.next, %loop.latch ]
+  %sum.01 = phi i32 [ 0, %entry ], [ %sum.1, %loop.latch ]
+  %rem4 = and i64 %iv, 1
+  %cmp21 = icmp eq i64 %rem4, 0
+  %gep = getelementptr inbounds nuw i32, ptr %nno, i64 %iv
+  %0 = load i32, ptr %gep, align 4
+  br i1 %cmp21, label %if.then, label %loop.latch
+
+if.then:                                        ; preds = %loop.header
+  %mul = shl i32 %0, 1
+  %rem27 = urem i32 %mul, 10
+  br label %loop.latch
+
+loop.latch:                                        ; preds = %loop.header, %if.then
+  %rem27.pn = phi i32 [ %rem27, %if.then ], [ %0, %loop.header ]
+  %sum.1 = or i32 %rem27.pn, %sum.01
+  %iv.next = add nsw i64 %iv, -1
+  %cmp19.not = icmp eq i64 %iv, 0
+  br i1 %cmp19.not, label %exit, label %loop.header
+
+exit:                                        ; preds = %loop.latch
+  %sum.1.lcssa = phi i32 [ %sum.1, %loop.latch ]
+  %cmp41 = icmp eq i32 %sum.1.lcssa, 0
+  ret i1 %cmp41
+}
+
+attributes #0 = { "target-features"="+avx" }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
index 5fb7df2..c14ddca 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
@@ -60,8 +60,8 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IV_START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IV_START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF3:%.*]] = urem i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[N_VEC4:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF3]]
 ; CHECK-NEXT:    [[IND_END5:%.*]] = add i64 [[IV_START]], [[N_VEC4]]
@@ -171,11 +171,11 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l,
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[K:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 16
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 32
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 32
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 48
 ; CHECK-NEXT:    store <16 x i16> [[TMP4]], ptr [[TMP9]], align 2
 ; CHECK-NEXT:    store <16 x i16> [[TMP5]], ptr [[TMP10]], align 2
-; CHECK-NEXT:    store <16 x i16> [[TMP6]], ptr [[TMP11]], align 2
+; CHECK-NEXT:    store <16 x i16> [[TMP6]], ptr [[TMP21]], align 2
 ; CHECK-NEXT:    store <16 x i16> [[TMP7]], ptr [[TMP12]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i16> [[STEP_ADD_3]], [[TMP1]]
@@ -191,8 +191,8 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l,
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF4:%.*]] = urem i64 [[L]], 8
 ; CHECK-NEXT:    [[N_VEC5:%.*]] = sub i64 [[L]], [[N_MOD_VF4]]
 ; CHECK-NEXT:    [[DOTCAST7:%.*]] = trunc i64 [[N_VEC5]] to i16
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
index 0e511cf..6fc7080 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
@@ -48,8 +48,8 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -154,10 +154,10 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE45]], [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT8:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT6]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE44]], [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT9:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
index ab0b454..fc6059d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
@@ -61,8 +61,8 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 {
 ; AUTO_VEC-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; AUTO_VEC-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY]], label [[VEC_EPILOG_PH]]
 ; AUTO_VEC:       vec.epilog.ph:
-; AUTO_VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi float [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AUTO_VEC-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; AUTO_VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi float [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AUTO_VEC-NEXT:    [[N_VEC3:%.*]] = and i64 [[ZEXT]], 2147483644
 ; AUTO_VEC-NEXT:    [[DOTCAST5:%.*]] = uitofp nneg i64 [[N_VEC3]] to float
 ; AUTO_VEC-NEXT:    [[TMP7:%.*]] = fmul fast float [[DOTCAST5]], 5.000000e-01
@@ -441,8 +441,8 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) {
 ; AUTO_VEC-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; AUTO_VEC-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY]], label [[VEC_EPILOG_PH]]
 ; AUTO_VEC:       vec.epilog.ph:
-; AUTO_VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi float [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AUTO_VEC-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; AUTO_VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi float [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AUTO_VEC-NEXT:    [[N_VEC6:%.*]] = and i64 [[TMP0]], 4294967292
 ; AUTO_VEC-NEXT:    [[DOTCAST8:%.*]] = uitofp nneg i64 [[N_VEC6]] to float
 ; AUTO_VEC-NEXT:    [[TMP12:%.*]] = fmul reassoc float [[DOTCAST8]], 4.200000e+01
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index b6bccab..8c338d6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -665,16 +665,16 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; AVX512:       vec.epilog.iter.check:
-; AVX512-NEXT:    [[TMP22:%.*]] = mul i64 [[N_VEC]], 64
-; AVX512-NEXT:    [[IND_END15:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP22]]
 ; AVX512-NEXT:    [[TMP23:%.*]] = mul i64 [[N_VEC]], 4
 ; AVX512-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP23]]
+; AVX512-NEXT:    [[TMP38:%.*]] = mul i64 [[N_VEC]], 64
+; AVX512-NEXT:    [[IND_END15:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP38]]
 ; AVX512-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
 ; AVX512-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
 ; AVX512-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; AVX512:       vec.epilog.ph:
-; AVX512-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[DEST]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX512-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; AVX512-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[DEST]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX512-NEXT:    [[N_MOD_VF9:%.*]] = urem i64 [[TMP3]], 8
 ; AVX512-NEXT:    [[N_VEC10:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF9]]
 ; AVX512-NEXT:    [[TMP24:%.*]] = mul i64 [[N_VEC10]], 4
@@ -691,12 +691,12 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt
 ; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP27]]
 ; AVX512-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[IDXPROM]]
 ; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i32 0
-; AVX512-NEXT:    [[WIDE_LOAD23:%.*]] = load <8 x float>, ptr [[TMP30]], align 4, !alias.scope [[META17:![0-9]+]]
-; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD23]], <8 x ptr> [[TMP26]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20:![0-9]+]], !noalias [[META22:![0-9]+]]
+; AVX512-NEXT:    [[WIDE_LOAD17:%.*]] = load <8 x float>, ptr [[TMP30]], align 4, !alias.scope [[META17:![0-9]+]]
+; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD17]], <8 x ptr> [[TMP26]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20:![0-9]+]], !noalias [[META22:![0-9]+]]
 ; AVX512-NEXT:    [[TMP31:%.*]] = getelementptr float, ptr [[TMP28]], i32 0
-; AVX512-NEXT:    [[WIDE_LOAD24:%.*]] = load <8 x float>, ptr [[TMP31]], align 4, !alias.scope [[META24:![0-9]+]]
+; AVX512-NEXT:    [[WIDE_LOAD18:%.*]] = load <8 x float>, ptr [[TMP31]], align 4, !alias.scope [[META24:![0-9]+]]
 ; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, <8 x ptr> [[TMP26]], i64 1
-; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD24]], <8 x ptr> [[TMP32]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20]], !noalias [[META22]]
+; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD18]], <8 x ptr> [[TMP32]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20]], !noalias [[META22]]
 ; AVX512-NEXT:    [[INDEX_NEXT24]] = add nuw i64 [[INDEX18]], 8
 ; AVX512-NEXT:    [[PTR_IND20]] = getelementptr i8, ptr [[POINTER_PHI19]], i64 512
 ; AVX512-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT24]], [[N_VEC10]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
index 3b55044..68cbfad 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
@@ -348,10 +348,10 @@ define void @multiple_pointer_ivs_with_scalar_uses_only(ptr %A, ptr %B) #0 {
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ -12, [[MIDDLE_BLOCK]] ], [ 100, [[VECTOR_MEMCHECK]] ], [ 100, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2048, [[VECTOR_MEMCHECK]] ], [ 2048, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[A]], [[VECTOR_MEMCHECK]] ], [ [[A]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[B]], [[VECTOR_MEMCHECK]] ], [ [[B]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[IND_END5]], [[MIDDLE_BLOCK]] ], [ [[B]], [[VECTOR_MEMCHECK]] ], [ [[B]], [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2048, [[VECTOR_MEMCHECK]] ], [ 2048, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[DEC:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll
new file mode 100644
index 0000000..1dd2692
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll
@@ -0,0 +1,154 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define i16 @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
+; CHECK-LABEL: @wide_add_induction_step_live_in(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[O_1:%.*]] = add i16 [[OFF:%.*]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i16 [[DOTCAST]], [[O_1]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i16> splat (i16 4), [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i16> [[DOTSPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i16> <i16 0, i16 1, i16 2, i16 3>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i16> zeroinitializer, [[TMP2]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST:%.*]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 4
+; CHECK-NEXT:    store <4 x i16> [[TMP4]], ptr [[TMP6]], align 2
+; CHECK-NEXT:    store <4 x i16> [[TMP9]], ptr [[TMP8]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], [[TMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i16 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[ADD]] = add i16 [[IV_2]], [[O_1]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i16 [[ADD]], ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i16 [ [[ADD]], [[LOOP]] ], [ [[TMP0]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i16 [[ADD_LCSSA]]
+;
+entry:
+  %o.1 = add i16 %off, 2
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.2 = phi i16 [ 0, %entry ], [ %add, %loop ]
+  %add = add i16 %iv.2, %o.1
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv
+  store i16 %add, ptr %gep.dst, align 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec , label %exit, label %loop
+
+exit:
+  ret i16 %add
+}
+
+define i16 @wide_sub_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
+; CHECK-LABEL: @wide_sub_induction_step_live_in(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[O_1:%.*]] = add i16 [[OFF:%.*]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i16 -2, [[OFF]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i16 [[DOTCAST]], [[TMP0]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i16> splat (i16 4), [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i16> [[DOTSPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i16> <i16 0, i16 1, i16 2, i16 3>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i16> zeroinitializer, [[TMP3]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT1]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = sub <4 x i16> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = sub <4 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[DST:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 4
+; CHECK-NEXT:    store <4 x i16> [[TMP5]], ptr [[TMP7]], align 2
+; CHECK-NEXT:    store <4 x i16> [[TMP10]], ptr [[TMP9]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i16> [[TMP10]], i32 3
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i16 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[SUB:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[SUB]] = sub i16 [[IV_2]], [[O_1]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i16 [[SUB]], ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[SUB_LCSSA:%.*]] = phi i16 [ [[SUB]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i16 [[SUB_LCSSA]]
+;
+entry:
+  %o.1 = add i16 %off, 2
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.2 = phi i16 [ 0, %entry ], [ %sub, %loop ]
+  %sub = sub i16 %iv.2, %o.1
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv
+  store i16 %sub, ptr %gep.dst, align 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec , label %exit, label %loop
+
+exit:
+  ret i16 %sub
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
index b277264..f50177e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
@@ -56,12 +56,12 @@ define void @uaddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[DOTCAST1:%.*]] = trunc nuw i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END10:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST1]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END15:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP12]]
+; CHECK-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP13]]
-; CHECK-NEXT:    [[DOTCAST9:%.*]] = trunc nuw i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END10:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST9]]
+; CHECK-NEXT:    [[IND_END15:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 56
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
@@ -181,10 +181,10 @@ define void @fshl(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[IND_END14:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]]
-; CHECK-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[DOTCAST8:%.*]] = trunc nuw i64 [[N_VEC]] to i32
 ; CHECK-NEXT:    [[IND_END9:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST8]]
+; CHECK-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END14:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 120
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
index 597be33..9e87cc2 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -368,9 +368,9 @@ define void @tail_folded_store_avx512(ptr %start, ptr %end) #3 {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP3]], 63
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 64
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[N_VEC]], -72
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i32 [[TMP4]]
-; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <64 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT3]], <64 x i32> poison, <64 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -414,9 +414,9 @@ define void @tail_folded_store_avx512(ptr %start, ptr %end) #3 {
 ; AUTOVF-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP3]], 7
 ; AUTOVF-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8
 ; AUTOVF-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; AUTOVF-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1
 ; AUTOVF-NEXT:    [[TMP4:%.*]] = mul i32 [[N_VEC]], -72
 ; AUTOVF-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i32 [[TMP4]]
-; AUTOVF-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1
 ; AUTOVF-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; AUTOVF-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT3]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AUTOVF-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll b/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll
index bb7fe4d..1a9e7dd 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll
@@ -14,72 +14,70 @@ define i32 @unused_blend_after_unrolling(ptr %p, i32 %a, i1 %c.1, i16 %x, i16 %y
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT17:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT16]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT17]], splat (i1 true)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_SDIV_CONTINUE15:.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP24:%.*]], %[[PRED_SDIV_CONTINUE15]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[PRED_SDIV_CONTINUE15]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_SDIV_CONTINUE17:.*]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP24:%.*]], %[[PRED_SDIV_CONTINUE17]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[PRED_SDIV_CONTINUE17]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP2]], label %[[PRED_SDIV_IF:.*]], label %[[PRED_SDIV_CONTINUE:.*]]
 ; CHECK:       [[PRED_SDIV_IF]]:
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE]]
 ; CHECK:       [[PRED_SDIV_CONTINUE]]:
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
-; CHECK-NEXT:    br i1 [[TMP3]], label %[[PRED_SDIV_IF2:.*]], label %[[PRED_SDIV_CONTINUE3:.*]]
-; CHECK:       [[PRED_SDIV_IF2]]:
-; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE3]]
-; CHECK:       [[PRED_SDIV_CONTINUE3]]:
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
-; CHECK-NEXT:    br i1 [[TMP4]], label %[[PRED_SDIV_IF4:.*]], label %[[PRED_SDIV_CONTINUE5:.*]]
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[PRED_SDIV_IF4:.*]], label %[[PRED_SDIV_CONTINUE5:.*]]
 ; CHECK:       [[PRED_SDIV_IF4]]:
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE5]]
 ; CHECK:       [[PRED_SDIV_CONTINUE5]]:
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
-; CHECK-NEXT:    br i1 [[TMP5]], label %[[PRED_SDIV_IF6:.*]], label %[[PRED_SDIV_CONTINUE7:.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[PRED_SDIV_IF6:.*]], label %[[PRED_SDIV_CONTINUE7:.*]]
 ; CHECK:       [[PRED_SDIV_IF6]]:
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE7]]
 ; CHECK:       [[PRED_SDIV_CONTINUE7]]:
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
-; CHECK-NEXT:    br i1 [[TMP6]], label %[[PRED_SDIV_IF8:.*]], label %[[PRED_SDIV_CONTINUE9:.*]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[PRED_SDIV_IF8:.*]], label %[[PRED_SDIV_CONTINUE9:.*]]
 ; CHECK:       [[PRED_SDIV_IF8]]:
-; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i16 [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i16> poison, i16 [[TMP7]], i32 0
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE9]]
 ; CHECK:       [[PRED_SDIV_CONTINUE9]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x i16> [ poison, %[[PRED_SDIV_CONTINUE7]] ], [ [[TMP8]], %[[PRED_SDIV_IF8]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
-; CHECK-NEXT:    br i1 [[TMP10]], label %[[PRED_SDIV_IF10:.*]], label %[[PRED_SDIV_CONTINUE11:.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[PRED_SDIV_IF10:.*]], label %[[PRED_SDIV_CONTINUE11:.*]]
 ; CHECK:       [[PRED_SDIV_IF10]]:
-; CHECK-NEXT:    [[TMP11:%.*]] = sdiv i16 [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i16> [[TMP9]], i16 [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i16 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i16> poison, i16 [[TMP7]], i32 0
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE11]]
 ; CHECK:       [[PRED_SDIV_CONTINUE11]]:
-; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x i16> [ [[TMP9]], %[[PRED_SDIV_CONTINUE9]] ], [ [[TMP12]], %[[PRED_SDIV_IF10]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
-; CHECK-NEXT:    br i1 [[TMP14]], label %[[PRED_SDIV_IF12:.*]], label %[[PRED_SDIV_CONTINUE13:.*]]
+; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x i16> [ poison, %[[PRED_SDIV_CONTINUE9]] ], [ [[TMP8]], %[[PRED_SDIV_IF10]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[PRED_SDIV_IF12:.*]], label %[[PRED_SDIV_CONTINUE13:.*]]
 ; CHECK:       [[PRED_SDIV_IF12]]:
-; CHECK-NEXT:    [[TMP15:%.*]] = sdiv i16 [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i16> [[TMP13]], i16 [[TMP15]], i32 2
+; CHECK-NEXT:    [[TMP11:%.*]] = sdiv i16 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i16> [[TMP9]], i16 [[TMP11]], i32 1
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE13]]
 ; CHECK:       [[PRED_SDIV_CONTINUE13]]:
-; CHECK-NEXT:    [[TMP17:%.*]] = phi <4 x i16> [ [[TMP13]], %[[PRED_SDIV_CONTINUE11]] ], [ [[TMP16]], %[[PRED_SDIV_IF12]] ]
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
-; CHECK-NEXT:    br i1 [[TMP18]], label %[[PRED_SDIV_IF14:.*]], label %[[PRED_SDIV_CONTINUE15]]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x i16> [ [[TMP9]], %[[PRED_SDIV_CONTINUE11]] ], [ [[TMP12]], %[[PRED_SDIV_IF12]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[PRED_SDIV_IF14:.*]], label %[[PRED_SDIV_CONTINUE15:.*]]
 ; CHECK:       [[PRED_SDIV_IF14]]:
-; CHECK-NEXT:    [[TMP19:%.*]] = sdiv i16 [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[TMP19]], i32 3
+; CHECK-NEXT:    [[TMP15:%.*]] = sdiv i16 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i16> [[TMP13]], i16 [[TMP15]], i32 2
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE15]]
 ; CHECK:       [[PRED_SDIV_CONTINUE15]]:
-; CHECK-NEXT:    [[TMP21:%.*]] = phi <4 x i16> [ [[TMP17]], %[[PRED_SDIV_CONTINUE13]] ], [ [[TMP20]], %[[PRED_SDIV_IF14]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = phi <4 x i16> [ [[TMP13]], %[[PRED_SDIV_CONTINUE13]] ], [ [[TMP16]], %[[PRED_SDIV_IF14]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[PRED_SDIV_IF16:.*]], label %[[PRED_SDIV_CONTINUE17]]
+; CHECK:       [[PRED_SDIV_IF16]]:
+; CHECK-NEXT:    [[TMP19:%.*]] = sdiv i16 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[TMP19]], i32 3
+; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE17]]
+; CHECK:       [[PRED_SDIV_CONTINUE17]]:
+; CHECK-NEXT:    [[TMP21:%.*]] = phi <4 x i16> [ [[TMP17]], %[[PRED_SDIV_CONTINUE15]] ], [ [[TMP20]], %[[PRED_SDIV_IF16]] ]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i16> zeroinitializer, <4 x i16> [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT17]], splat (i1 true)
-; CHECK-NEXT:    [[TMP23:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT17]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP24]] = or <4 x i1> [[VEC_PHI]], [[TMP22]]
-; CHECK-NEXT:    [[TMP25]] = or <4 x i1> [[VEC_PHI1]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25]] = or <4 x i1> [[VEC_PHI3]], [[TMP22]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
 ; CHECK-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -91,8 +89,8 @@ define i32 @unused_blend_after_unrolling(ptr %p, i32 %a, i1 %c.1, i16 %x, i16 %y
 ; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i16> [[PREDPHI]], i32 3
 ; CHECK-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 97, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 97, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; CHECK:       [[LOOP_HEADER]]:
 ; CHECK-NEXT:    [[B:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
index cc60359..7816c49 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
@@ -16,18 +16,16 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly %
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    store <8 x float> [[TMP4]], ptr [[TMP5]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -43,7 +41,7 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly %
 ; CHECK-NEXT:    store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
index 41868d6..d1c0201 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
@@ -85,9 +85,9 @@ define void @test(ptr %p) {
 ; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
 ; VEC-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VEC:       scalar.ph:
-; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY:%.*]] ]
+; VEC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY:%.*]] ]
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY]] ]
 ; VEC-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; VEC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY]] ]
 ; VEC-NEXT:    br label [[FOR_BODY:%.*]]
 ; VEC:       for.body:
 ; VEC-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IDX:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
index 052a963..28f8988 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
@@ -38,16 +38,16 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<%4>
 ; CHECK: Cost of 0 for VF 2: vp<%5> = vector-pointer ir<%arrayidx>
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%0> = load vp<%5>
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv> = zext  ir<%0> to i32
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i32
 ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<%4>
 ; CHECK: Cost of 0 for VF 2: vp<%6> = vector-pointer ir<%arrayidx2>
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%1> = load vp<%6>
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv3> = zext  ir<%1> to i32
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv3> = zext ir<%1> to i32
 ; CHECK: Cost of 0 for VF 2: WIDEN ir<%conv4> = and ir<%sum.013>, ir<255>
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%add> = add ir<%conv>, ir<%conv4>
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%add5> = add ir<%add>, ir<%conv3>
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<%7> = trunc  ir<%add5> to i8
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<%8> = zext  vp<%7> to i32
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<%7> = trunc ir<%add5> to i8
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<%8> = zext vp<%7> to i32
 ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%3>, vp<%0>
 ; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%1>
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
index 8d56c33..cfae26a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
@@ -15,19 +15,18 @@ define void @smax_call_uniform(ptr %dst, i64 %x) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_UREM_CONTINUE6:.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP2]], label %[[PRED_UREM_IF:.*]], label %[[PRED_UREM_CONTINUE:.*]]
 ; CHECK:       [[PRED_UREM_IF]]:
 ; CHECK-NEXT:    [[REM:%.*]] = urem i64 [[MUL]], [[X]]
 ; CHECK-NEXT:    br label %[[PRED_UREM_CONTINUE]]
 ; CHECK:       [[PRED_UREM_CONTINUE]]:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[REM]], %[[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP5]], label %[[PRED_UREM_IF1:.*]], label %[[PRED_UREM_CONTINUE2:.*]]
 ; CHECK:       [[PRED_UREM_IF1]]:
 ; CHECK-NEXT:    [[TMP6:%.*]] = urem i64 [[MUL]], [[X]]
@@ -48,7 +47,7 @@ define void @smax_call_uniform(ptr %dst, i64 %x) {
 ; CHECK:       [[PRED_UREM_CONTINUE6]]:
 ; CHECK-NEXT:    [[TMP12:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP4]], i64 0)
 ; CHECK-NEXT:    [[TMP13:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP9]], i64 0)
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[P:%.*]] = select i1 [[TMP14]], i64 [[TMP12]], i64 1
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[PREDPHI7:%.*]] = select i1 [[TMP15]], i64 [[TMP13]], i64 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index 236ed30..a0294f7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -65,16 +65,16 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT99:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[IND_END12:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[TMP64:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[IND_END9:%.*]] = add i64 8, [[TMP64]]
+; CHECK-NEXT:    [[IND_END12:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP6]], [[N_VEC]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 8, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi i64 [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF6:%.*]] = urem i64 [[TMP6]], 8
 ; CHECK-NEXT:    [[N_VEC7:%.*]] = sub i64 [[TMP6]], [[N_MOD_VF6]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[N_VEC7]], 2
@@ -112,16 +112,16 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[IND_END8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 8, [[ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       iter.check27:
+; CHECK:       iter.check23:
 ; CHECK-NEXT:    [[TMP26:%.*]] = add nsw i64 [[TMP3]], -9
 ; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP26]], 1
 ; CHECK-NEXT:    [[TMP28:%.*]] = add nuw i64 [[TMP27]], 1
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK25:%.*]] = icmp ult i64 [[TMP28]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK25]], label [[VEC_EPILOG_SCALAR_PH46:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK29:%.*]]
-; CHECK:       vector.main.loop.iter.check29:
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK25]], label [[VEC_EPILOG_SCALAR_PH41:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK25:%.*]]
+; CHECK:       vector.main.loop.iter.check25:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK28:%.*]] = icmp ult i64 [[TMP28]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK28]], label [[VEC_EPILOG_PH47:%.*]], label [[VECTOR_PH30:%.*]]
-; CHECK:       vector.ph30:
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK28]], label [[VEC_EPILOG_PH42:%.*]], label [[VECTOR_PH30:%.*]]
+; CHECK:       vector.ph26:
 ; CHECK-NEXT:    [[N_MOD_VF31:%.*]] = urem i64 [[TMP28]], 16
 ; CHECK-NEXT:    [[N_VEC32:%.*]] = sub i64 [[TMP28]], [[N_MOD_VF31]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = mul i64 [[N_VEC32]], 2
@@ -129,16 +129,16 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[IND_END43:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i1> poison, i1 [[TOBOOL6]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i1> [[BROADCAST_SPLATINSERT]], <16 x i1> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY35:%.*]]
-; CHECK:       vector.body35:
-; CHECK-NEXT:    [[INDEX34:%.*]] = phi i64 [ 0, [[VECTOR_PH30]] ], [ [[INDEX_NEXT39:%.*]], [[VECTOR_BODY35]] ]
-; CHECK-NEXT:    [[VEC_IND35:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, [[VECTOR_PH30]] ], [ [[VEC_IND_NEXT36:%.*]], [[VECTOR_BODY35]] ]
-; CHECK-NEXT:    [[VEC_IND37:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, [[VECTOR_PH30]] ], [ [[VEC_IND_NEXT38:%.*]], [[VECTOR_BODY35]] ]
+; CHECK-NEXT:    [[TMP34:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    br label [[VECTOR_BODY29:%.*]]
+; CHECK:       vector.body29:
+; CHECK-NEXT:    [[INDEX34:%.*]] = phi i64 [ 0, [[VECTOR_PH30]] ], [ [[INDEX_NEXT39:%.*]], [[VECTOR_BODY29]] ]
+; CHECK-NEXT:    [[VEC_IND35:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, [[VECTOR_PH30]] ], [ [[VEC_IND_NEXT36:%.*]], [[VECTOR_BODY29]] ]
+; CHECK-NEXT:    [[VEC_IND37:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, [[VECTOR_PH30]] ], [ [[VEC_IND_NEXT38:%.*]], [[VECTOR_BODY29]] ]
 ; CHECK-NEXT:    [[TMP30:%.*]] = sub nsw <16 x i64> splat (i64 8), [[VEC_IND35]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, <16 x i64> [[VEC_IND35]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = add nsw <16 x i64> [[TMP30]], [[VEC_IND37]]
 ; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [10 x i32], <16 x ptr> [[TMP31]], <16 x i64> [[TMP32]], i64 0
-; CHECK-NEXT:    [[TMP34:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> splat (i32 8), <16 x ptr> [[TMP33]], i32 16, <16 x i1> [[TMP34]])
 ; CHECK-NEXT:    [[TMP35:%.*]] = or disjoint <16 x i64> [[VEC_IND37]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP36:%.*]] = add nsw <16 x i64> [[TMP30]], [[TMP35]]
@@ -153,44 +153,44 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[VEC_IND_NEXT36]] = add <16 x i64> [[VEC_IND35]], splat (i64 32)
 ; CHECK-NEXT:    [[VEC_IND_NEXT38]] = add <16 x i64> [[VEC_IND37]], splat (i64 32)
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT39]], [[N_VEC32]]
-; CHECK-NEXT:    br i1 [[TMP41]], label [[MIDDLE_BLOCK24:%.*]], label [[VECTOR_BODY35]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block24:
+; CHECK-NEXT:    br i1 [[TMP41]], label [[MIDDLE_BLOCK20:%.*]], label [[VECTOR_BODY29]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block20:
 ; CHECK-NEXT:    [[CMP_N40:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC32]]
-; CHECK-NEXT:    br i1 [[CMP_N40]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK48:%.*]]
-; CHECK:       vec.epilog.iter.check49:
-; CHECK-NEXT:    [[IND_END58:%.*]] = mul i64 [[N_VEC32]], 2
+; CHECK-NEXT:    br i1 [[CMP_N40]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK43:%.*]]
+; CHECK:       vec.epilog.iter.check43:
 ; CHECK-NEXT:    [[TMP42:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[IND_END55:%.*]] = add i64 8, [[TMP42]]
+; CHECK-NEXT:    [[IND_END58:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[N_VEC_REMAINING49:%.*]] = sub i64 [[TMP28]], [[N_VEC32]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK50:%.*]] = icmp ult i64 [[N_VEC_REMAINING49]], 8
-; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK50]], label [[VEC_EPILOG_SCALAR_PH46]], label [[VEC_EPILOG_PH47]]
-; CHECK:       vec.epilog.ph48:
-; CHECK-NEXT:    [[BC_RESUME_VAL42:%.*]] = phi i64 [ [[IND_END41]], [[VEC_EPILOG_ITER_CHECK48]] ], [ 8, [[VECTOR_MAIN_LOOP_ITER_CHECK29]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL44:%.*]] = phi i64 [ [[IND_END43]], [[VEC_EPILOG_ITER_CHECK48]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK29]] ]
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL51:%.*]] = phi i64 [ [[N_VEC32]], [[VEC_EPILOG_ITER_CHECK48]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK29]] ]
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK50]], label [[VEC_EPILOG_SCALAR_PH41]], label [[VEC_EPILOG_PH42]]
+; CHECK:       vec.epilog.ph42:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL51:%.*]] = phi i64 [ [[N_VEC32]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK25]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL42:%.*]] = phi i64 [ [[IND_END41]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 8, [[VECTOR_MAIN_LOOP_ITER_CHECK25]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL44:%.*]] = phi i64 [ [[IND_END43]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK25]] ]
 ; CHECK-NEXT:    [[N_MOD_VF52:%.*]] = urem i64 [[TMP28]], 8
 ; CHECK-NEXT:    [[N_VEC53:%.*]] = sub i64 [[TMP28]], [[N_MOD_VF52]]
 ; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[N_VEC53]], 2
 ; CHECK-NEXT:    [[IND_END54:%.*]] = add i64 8, [[TMP43]]
 ; CHECK-NEXT:    [[IND_END57:%.*]] = mul i64 [[N_VEC53]], 2
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT50:%.*]] = insertelement <8 x i1> poison, i1 [[TOBOOL6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT73:%.*]] = shufflevector <8 x i1> [[BROADCAST_SPLATINSERT50]], <8 x i1> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP48:%.*]] = xor <8 x i1> [[BROADCAST_SPLAT73]], splat (i1 true)
 ; CHECK-NEXT:    [[DOTSPLATINSERT62:%.*]] = insertelement <8 x i64> poison, i64 [[BC_RESUME_VAL42]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT63:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT62]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION64:%.*]] = add <8 x i64> [[DOTSPLAT63]], <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14>
 ; CHECK-NEXT:    [[DOTSPLATINSERT67:%.*]] = insertelement <8 x i64> poison, i64 [[BC_RESUME_VAL44]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT68:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT67]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION69:%.*]] = add <8 x i64> [[DOTSPLAT68]], <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14>
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT72:%.*]] = insertelement <8 x i1> poison, i1 [[TOBOOL6]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT73:%.*]] = shufflevector <8 x i1> [[BROADCAST_SPLATINSERT72]], <8 x i1> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY58:%.*]]
-; CHECK:       vec.epilog.vector.body58:
-; CHECK-NEXT:    [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL51]], [[VEC_EPILOG_PH47]] ], [ [[INDEX_NEXT74:%.*]], [[VEC_EPILOG_VECTOR_BODY58]] ]
-; CHECK-NEXT:    [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], [[VEC_EPILOG_PH47]] ], [ [[VEC_IND_NEXT66:%.*]], [[VEC_EPILOG_VECTOR_BODY58]] ]
-; CHECK-NEXT:    [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], [[VEC_EPILOG_PH47]] ], [ [[VEC_IND_NEXT71:%.*]], [[VEC_EPILOG_VECTOR_BODY58]] ]
+; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY50:%.*]]
+; CHECK:       vec.epilog.vector.body52:
+; CHECK-NEXT:    [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL51]], [[VEC_EPILOG_PH42]] ], [ [[INDEX_NEXT74:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ]
+; CHECK-NEXT:    [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT66:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ]
+; CHECK-NEXT:    [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT71:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ]
 ; CHECK-NEXT:    [[TMP44:%.*]] = sub nsw <8 x i64> splat (i64 8), [[VEC_IND65]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, <8 x i64> [[VEC_IND65]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = add nsw <8 x i64> [[TMP44]], [[VEC_IND70]]
 ; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [10 x i32], <8 x ptr> [[TMP45]], <8 x i64> [[TMP46]], i64 0
-; CHECK-NEXT:    [[TMP48:%.*]] = xor <8 x i1> [[BROADCAST_SPLAT73]], splat (i1 true)
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> splat (i32 8), <8 x ptr> [[TMP47]], i32 16, <8 x i1> [[TMP48]])
 ; CHECK-NEXT:    [[TMP49:%.*]] = or disjoint <8 x i64> [[VEC_IND70]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP50:%.*]] = add nsw <8 x i64> [[TMP44]], [[TMP49]]
@@ -205,17 +205,17 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[VEC_IND_NEXT66]] = add <8 x i64> [[VEC_IND65]], splat (i64 16)
 ; CHECK-NEXT:    [[VEC_IND_NEXT71]] = add <8 x i64> [[VEC_IND70]], splat (i64 16)
 ; CHECK-NEXT:    [[TMP55:%.*]] = icmp eq i64 [[INDEX_NEXT74]], [[N_VEC53]]
-; CHECK-NEXT:    br i1 [[TMP55]], label [[VEC_EPILOG_MIDDLE_BLOCK46:%.*]], label [[VEC_EPILOG_VECTOR_BODY58]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       vec.epilog.middle.block46:
-; CHECK-NEXT:    [[CMP_N75:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC53]]
-; CHECK-NEXT:    br i1 [[CMP_N75]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH46]]
-; CHECK:       vec.epilog.scalar.ph47:
-; CHECK-NEXT:    [[BC_RESUME_VAL56:%.*]] = phi i64 [ [[IND_END54]], [[VEC_EPILOG_MIDDLE_BLOCK46]] ], [ 8, [[ITER_CHECK27]] ], [ [[IND_END55]], [[VEC_EPILOG_ITER_CHECK48]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL59:%.*]] = phi i64 [ [[IND_END57]], [[VEC_EPILOG_MIDDLE_BLOCK46]] ], [ 0, [[ITER_CHECK27]] ], [ [[IND_END58]], [[VEC_EPILOG_ITER_CHECK48]] ]
+; CHECK-NEXT:    br i1 [[TMP55]], label [[VEC_EPILOG_MIDDLE_BLOCK40:%.*]], label [[VEC_EPILOG_VECTOR_BODY50]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       vec.epilog.middle.block40:
+; CHECK-NEXT:    [[CMP_N65:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC53]]
+; CHECK-NEXT:    br i1 [[CMP_N65]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH41]]
+; CHECK:       vec.epilog.scalar.ph41:
+; CHECK-NEXT:    [[BC_RESUME_VAL56:%.*]] = phi i64 [ [[IND_END54]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ 8, [[ITER_CHECK27]] ], [ [[IND_END55]], [[VEC_EPILOG_ITER_CHECK43]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL67:%.*]] = phi i64 [ [[IND_END57]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ 0, [[ITER_CHECK27]] ], [ [[IND_END58]], [[VEC_EPILOG_ITER_CHECK43]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY_US:%.*]]
 ; CHECK:       for.body.us:
-; CHECK-NEXT:    [[INDVARS_IV78:%.*]] = phi i64 [ [[INDVARS_IV_NEXT79:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US:%.*]] ], [ [[BC_RESUME_VAL56]], [[VEC_EPILOG_SCALAR_PH46]] ]
-; CHECK-NEXT:    [[INDVARS_IV70:%.*]] = phi i64 [ [[INDVARS_IV_NEXT71:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US]] ], [ [[BC_RESUME_VAL59]], [[VEC_EPILOG_SCALAR_PH46]] ]
+; CHECK-NEXT:    [[INDVARS_IV78:%.*]] = phi i64 [ [[INDVARS_IV_NEXT79:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US:%.*]] ], [ [[BC_RESUME_VAL56]], [[VEC_EPILOG_SCALAR_PH41]] ]
+; CHECK-NEXT:    [[INDVARS_IV70:%.*]] = phi i64 [ [[INDVARS_IV_NEXT71:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US]] ], [ [[BC_RESUME_VAL67]], [[VEC_EPILOG_SCALAR_PH41]] ]
 ; CHECK-NEXT:    [[TMP56:%.*]] = sub nsw i64 8, [[INDVARS_IV78]]
 ; CHECK-NEXT:    [[ADD_PTR_US:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, i64 [[INDVARS_IV78]]
 ; CHECK-NEXT:    [[TMP57:%.*]] = add nsw i64 [[TMP56]], [[INDVARS_IV70]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
index 55ff26c..c9132ba 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -142,8 +142,8 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_114]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT19]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY15:%.*]]
-; CHECK:       vector.body15:
-; CHECK-NEXT:    [[INDEX16:%.*]] = phi i64 [ 0, [[VECTOR_PH9]] ], [ [[INDEX_NEXT29:%.*]], [[PRED_STORE_CONTINUE28:%.*]] ]
+; CHECK:       vector.body14:
+; CHECK-NEXT:    [[INDEX16:%.*]] = phi i64 [ 0, [[VECTOR_PH9]] ], [ [[INDEX_NEXT29:%.*]], [[PRED_STORE_CONTINUE78:%.*]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX16]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX16]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT18:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT17]], <4 x i64> poison, <4 x i32> zeroinitializer
@@ -151,7 +151,7 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT20]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[TMP18]], i64 0
 ; CHECK-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
-; CHECK:       pred.store.if21:
+; CHECK:       pred.store.if20:
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 [[OFFSET_IDX]]
@@ -160,10 +160,10 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP25:%.*]] = and i32 [[TMP23]], [[TMP21]]
 ; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP24]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE22]]
-; CHECK:       pred.store.continue22:
+; CHECK:       pred.store.continue21:
 ; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i1> [[TMP18]], i64 1
 ; CHECK-NEXT:    br i1 [[TMP26]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
-; CHECK:       pred.store.if23:
+; CHECK:       pred.store.if22:
 ; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP27]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4
@@ -173,10 +173,10 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP33:%.*]] = and i32 [[TMP31]], [[TMP29]]
 ; CHECK-NEXT:    store i32 [[TMP33]], ptr [[TMP32]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE24]]
-; CHECK:       pred.store.continue24:
+; CHECK:       pred.store.continue23:
 ; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[TMP18]], i64 2
 ; CHECK-NEXT:    br i1 [[TMP34]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
-; CHECK:       pred.store.if25:
+; CHECK:       pred.store.if24:
 ; CHECK-NEXT:    [[TMP35:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP35]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = load i32, ptr [[TMP36]], align 4
@@ -186,10 +186,10 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP41:%.*]] = and i32 [[TMP39]], [[TMP37]]
 ; CHECK-NEXT:    store i32 [[TMP41]], ptr [[TMP40]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE26]]
-; CHECK:       pred.store.continue26:
+; CHECK:       pred.store.continue25:
 ; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i1> [[TMP18]], i64 3
-; CHECK-NEXT:    br i1 [[TMP42]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28]]
-; CHECK:       pred.store.if27:
+; CHECK-NEXT:    br i1 [[TMP42]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE78]]
+; CHECK:       pred.store.if26:
 ; CHECK-NEXT:    [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP43]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = load i32, ptr [[TMP44]], align 4
@@ -198,8 +198,8 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[TMP43]]
 ; CHECK-NEXT:    [[TMP49:%.*]] = and i32 [[TMP47]], [[TMP45]]
 ; CHECK-NEXT:    store i32 [[TMP49]], ptr [[TMP48]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE28]]
-; CHECK:       pred.store.continue28:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE78]]
+; CHECK:       pred.store.continue27:
 ; CHECK-NEXT:    [[INDEX_NEXT29]] = add nuw i64 [[INDEX16]], 4
 ; CHECK-NEXT:    [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT29]], [[N_VEC12]]
 ; CHECK-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY15]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -273,7 +273,7 @@ define void @example3(i32 %n, ptr noalias nocapture %p, ptr noalias nocapture %q
 ; CHECK-NEXT:    [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT11]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE18:%.*]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE16:%.*]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[OFFSET_IDX6:%.*]] = shl i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
@@ -290,38 +290,38 @@ define void @example3(i32 %n, ptr noalias nocapture %p, ptr noalias nocapture %q
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1
-; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
-; CHECK:       pred.store.if13:
+; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; CHECK:       pred.store.if11:
 ; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint i64 [[OFFSET_IDX]], 4
 ; CHECK-NEXT:    [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = or disjoint i64 [[OFFSET_IDX6]], 4
 ; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[NEXT_GEP8]], align 16
 ; CHECK-NEXT:    store i32 [[TMP9]], ptr [[NEXT_GEP3]], align 16
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE14]]
-; CHECK:       pred.store.continue14:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
+; CHECK:       pred.store.continue12:
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2
-; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
-; CHECK:       pred.store.if15:
+; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
+; CHECK:       pred.store.if13:
 ; CHECK-NEXT:    [[TMP11:%.*]] = or disjoint i64 [[OFFSET_IDX]], 8
 ; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = or disjoint i64 [[OFFSET_IDX6]], 8
 ; CHECK-NEXT:    [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[NEXT_GEP9]], align 16
 ; CHECK-NEXT:    store i32 [[TMP13]], ptr [[NEXT_GEP4]], align 16
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE16]]
-; CHECK:       pred.store.continue16:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE14]]
+; CHECK:       pred.store.continue14:
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3
-; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18]]
-; CHECK:       pred.store.if17:
+; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16]]
+; CHECK:       pred.store.if15:
 ; CHECK-NEXT:    [[TMP15:%.*]] = or disjoint i64 [[OFFSET_IDX]], 12
 ; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = or disjoint i64 [[OFFSET_IDX6]], 12
 ; CHECK-NEXT:    [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[NEXT_GEP10]], align 16
 ; CHECK-NEXT:    store i32 [[TMP17]], ptr [[NEXT_GEP5]], align 16
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE18]]
-; CHECK:       pred.store.continue18:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE16]]
+; CHECK:       pred.store.continue16:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -470,8 +470,8 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst)
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i64 1
-; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
-; CHECK:       pred.store.if10:
+; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; CHECK:       pred.store.if9:
 ; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint i64 [[OFFSET_IDX5]], 4
 ; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = or disjoint i64 [[OFFSET_IDX]], 2
@@ -480,11 +480,11 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst)
 ; CHECK-NEXT:    [[TMP10:%.*]] = zext i16 [[TMP9]] to i32
 ; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i32 [[TMP10]], 7
 ; CHECK-NEXT:    store i32 [[TMP11]], ptr [[NEXT_GEP7]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE11]]
-; CHECK:       pred.store.continue11:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
+; CHECK:       pred.store.continue10:
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i1> [[TMP1]], i64 2
-; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
-; CHECK:       pred.store.if12:
+; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; CHECK:       pred.store.if11:
 ; CHECK-NEXT:    [[TMP13:%.*]] = or disjoint i64 [[OFFSET_IDX5]], 8
 ; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = or disjoint i64 [[OFFSET_IDX]], 4
@@ -493,11 +493,11 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst)
 ; CHECK-NEXT:    [[TMP16:%.*]] = zext i16 [[TMP15]] to i32
 ; CHECK-NEXT:    [[TMP17:%.*]] = shl nuw nsw i32 [[TMP16]], 7
 ; CHECK-NEXT:    store i32 [[TMP17]], ptr [[NEXT_GEP8]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE13]]
-; CHECK:       pred.store.continue13:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
+; CHECK:       pred.store.continue12:
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP1]], i64 3
-; CHECK-NEXT:    br i1 [[TMP18]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15]]
-; CHECK:       pred.store.if14:
+; CHECK-NEXT:    br i1 [[TMP18]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE15]]
+; CHECK:       pred.store.if13:
 ; CHECK-NEXT:    [[TMP19:%.*]] = or disjoint i64 [[OFFSET_IDX5]], 12
 ; CHECK-NEXT:    [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP19]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = or disjoint i64 [[OFFSET_IDX]], 6
@@ -507,7 +507,7 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst)
 ; CHECK-NEXT:    [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 7
 ; CHECK-NEXT:    store i32 [[TMP23]], ptr [[NEXT_GEP9]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE15]]
-; CHECK:       pred.store.continue15:
+; CHECK:       pred.store.continue14:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
 ; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
index b8dcfd3..8661d86f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
@@ -6,8 +6,8 @@ target triple = "x86_64-apple-macosx10.8.0"
 
 
 ; CHECK: cost of 4 for VF 1 For instruction:   %conv = uitofp i64 %tmp to double
-; CHECK: Cost of 5 for VF 2: WIDEN-CAST ir<%conv> = uitofp  ir<%tmp> to double
-; CHECK: Cost of 10 for VF 4: WIDEN-CAST ir<%conv> = uitofp  ir<%tmp> to double
+; CHECK: Cost of 5 for VF 2: WIDEN-CAST ir<%conv> = uitofp ir<%tmp> to double
+; CHECK: Cost of 10 for VF 4: WIDEN-CAST ir<%conv> = uitofp ir<%tmp> to double
 define void @uint64_to_double_cost(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) nounwind {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/blend-in-header.ll b/llvm/test/Transforms/LoopVectorize/blend-in-header.ll
index 4c95584..2fea016 100644
--- a/llvm/test/Transforms/LoopVectorize/blend-in-header.ll
+++ b/llvm/test/Transforms/LoopVectorize/blend-in-header.ll
@@ -171,11 +171,11 @@ define i64 @invar_cond_incoming_ops_reordered(i1 %c) {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> splat (i64 1), <4 x i64> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> splat (i64 1), <4 x i64> zeroinitializer
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/branch-weights.ll b/llvm/test/Transforms/LoopVectorize/branch-weights.ll
index 6e7efe0..e11f77d 100644
--- a/llvm/test/Transforms/LoopVectorize/branch-weights.ll
+++ b/llvm/test/Transforms/LoopVectorize/branch-weights.ll
@@ -33,7 +33,7 @@
 ; CHECK:   br i1 {{.+}}, label %vec.epilog.middle.block, label %vec.epilog.vector.body, !prof [[PROF_F0_VEC_EPILOG_VECTOR_BODY:![0-9]+]]
 ;
 ; CHECK: vec.epilog.middle.block:
-; CHECK:   br i1 %cmp.n9, label %exit.loopexit, label %vec.epilog.scalar.ph, !prof [[PROF_F0_MIDDLE_BLOCKS:![0-9]+]]
+; CHECK:   br i1 %cmp.n{{.+}}, label %exit.loopexit, label %vec.epilog.scalar.ph, !prof [[PROF_F0_MIDDLE_BLOCKS:![0-9]+]]
 ;
 ; CHECK: vec.epilog.scalar.ph:
 ; CHECK:   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
index 04ce956..0f34f62 100644
--- a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
+++ b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
@@ -11,22 +11,20 @@ define i32 @foo(ptr %p) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ], !dbg [[DBG3:![0-9]+]]
-; CHECK-NEXT:    store i8 0, ptr [[P]], align 1, !dbg [[DBG7:![0-9]+]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2, !dbg [[DBG3]]
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !dbg [[DBG3]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    store i8 0, ptr [[P]], align 1, !dbg [[DBG3:![0-9]+]]
+; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]], !dbg [[DBG7:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG11:![0-9]+]]
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG8:![0-9]+]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG3]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG9:![0-9]+]]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], !dbg [[DBG3]]
-; CHECK-NEXT:    [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG12:![0-9]+]]
-; CHECK-NEXT:    store i8 [[CONV]], ptr [[P]], align 1, !dbg [[DBG7]]
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG13:![0-9]+]]
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1, !dbg [[DBG14:![0-9]+]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG11]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], !dbg [[DBG9]]
+; CHECK-NEXT:    [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG7]]
+; CHECK-NEXT:    store i8 [[CONV]], ptr [[P]], align 1, !dbg [[DBG3]]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG10:![0-9]+]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1, !dbg [[DBG11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG8]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret i32 0
 ;
@@ -63,17 +61,16 @@ exit:                              ; preds = %loop
 ;.
 ; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
 ; CHECK: [[META1]] = !DIFile(filename: "test.cpp", directory: {{.*}})
-; CHECK: [[DBG3]] = !DILocation(line: 4, scope: [[META4:![0-9]+]])
+; CHECK: [[DBG3]] = !DILocation(line: 6, scope: [[META4:![0-9]+]])
 ; CHECK: [[META4]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 11, type: [[META5:![0-9]+]], spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META6:![0-9]+]])
 ; CHECK: [[META5]] = distinct !DISubroutineType(types: [[META6]])
 ; CHECK: [[META6]] = !{}
-; CHECK: [[DBG7]] = !DILocation(line: 6, scope: [[META4]])
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META9:![0-9]+]], [[META10:![0-9]+]]}
-; CHECK: [[META9]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META10]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[DBG11]] = !DILocation(line: 9, scope: [[META4]])
-; CHECK: [[DBG12]] = !DILocation(line: 5, scope: [[META4]])
-; CHECK: [[DBG13]] = !DILocation(line: 7, scope: [[META4]])
-; CHECK: [[DBG14]] = !DILocation(line: 8, scope: [[META4]])
-; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META10]], [[META9]]}
+; CHECK: [[DBG7]] = !DILocation(line: 5, scope: [[META4]])
+; CHECK: [[DBG8]] = !DILocation(line: 9, scope: [[META4]])
+; CHECK: [[DBG9]] = !DILocation(line: 4, scope: [[META4]])
+; CHECK: [[DBG10]] = !DILocation(line: 7, scope: [[META4]])
+; CHECK: [[DBG11]] = !DILocation(line: 8, scope: [[META4]])
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META13:![0-9]+]], [[META14:![0-9]+]]}
+; CHECK: [[META13]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[META14]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
index 8baff7b..1333451 100644
--- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
@@ -104,6 +104,83 @@ exit:
   ret void
 }
 
+define void @deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
+; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4) ]
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-NEXT:    [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
+; CHECK-NEXT:    [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
+; CHECK-NEXT:    br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
+; CHECK:       [[LOOP_THEN]]:
+; CHECK-NEXT:    [[L_A:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4) ]
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
+  %l.b = load i32, ptr %gep.b, align 4
+  %c.1 = icmp sge i32 %l.b, 0
+  br i1 %c.1, label %loop.latch, label %loop.then
+
+loop.then:
+  %l.a = load i32, ptr %a, align 4
+  br label %loop.latch
+
+loop.latch:
+  %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ]
+  %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
+  store i32 %merge, ptr %gep.c, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
 define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
 ; CHECK-LABEL: define void @deref_assumption_too_small_in_header_constant_trip_count(
 ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
@@ -150,7 +227,7 @@ define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalia
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -173,7 +250,7 @@ define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalia
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -251,7 +328,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -274,7 +351,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -352,7 +429,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attrib
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -375,7 +452,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attrib
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -453,7 +530,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -476,7 +553,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -550,7 +627,7 @@ define void @deref_assumption_in_then_constant_trip_count(ptr noalias %a, ptr no
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -573,7 +650,7 @@ define void @deref_assumption_in_then_constant_trip_count(ptr noalias %a, ptr no
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -653,7 +730,7 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias %a, ptr n
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -676,7 +753,7 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias %a, ptr n
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -757,7 +834,7 @@ define void @deref_assumption_in_header_variable_trip_count(ptr noalias %a, ptr
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -781,7 +858,7 @@ define void @deref_assumption_in_header_variable_trip_count(ptr noalias %a, ptr
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -854,7 +931,7 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noali
 ; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -876,7 +953,7 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noali
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -949,7 +1026,7 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1
 ; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -971,7 +1048,7 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -1027,7 +1104,7 @@ define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4
 ; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP7]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -1049,7 +1126,7 @@ define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -1123,7 +1200,7 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via
 ; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -1145,7 +1222,7 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -1218,7 +1295,7 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known
 ; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -1240,7 +1317,7 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP29:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -1313,7 +1390,7 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4
 ; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -1335,7 +1412,7 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP29:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP31:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -1397,4 +1474,6 @@ exit:
 ; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
 ; CHECK: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]}
+; CHECK: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]}
+; CHECK: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
index c159ec8..94593a7 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
@@ -241,9 +241,9 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ false, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i1 [[BC_MERGE_RDX]], false
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
@@ -275,8 +275,8 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-NEXT:    br i1 [[CMP_N8]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX17:%.*]] = phi i1 [ [[RDX_SELECT16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ false, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -426,8 +426,8 @@ define i1 @any_of_reduction_i1_epilog2(ptr %start, ptr %end, i64 %x) {
 ; CHECK-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC8]]
 ; CHECK-NEXT:    br i1 [[CMP_N10]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], [[ITER_CHECK:%.*]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX23:%.*]] = phi i1 [ [[RDX_SELECT22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ true, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX23:%.*]] = phi i1 [ [[RDX_SELECT22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ true, [[ITER_CHECK:%.*]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], [[ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[RED:%.*]] = phi i1 [ [[BC_MERGE_RDX23]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
index 0eab97b..32d32a6 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
@@ -45,12 +45,13 @@ define void @test_chained_first_order_recurrences_1(ptr %ptr) {
 ; CHECK-NEXT:  scalar.ph
 ; CHECK-NEXT:    EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22>
 ; CHECK-NEXT:    EMIT vp<[[RESUME_2_P:%.*]]>.1 = resume-phi vp<[[RESUME_2]]>.1, ir<33>
+; CHECK-NEXT:    EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
 ; CHECK-NEXT:    IR   %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %for.2 = phi i16 [ 33, %entry ], [ %for.1, %loop ] (extra operand: vp<[[RESUME_2_P]]>.1 from scalar.ph)
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond.not = icmp eq i64 %iv.next, 1000
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -125,13 +126,14 @@ define void @test_chained_first_order_recurrences_3(ptr %ptr) {
 ; CHECK-NEXT:    EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22>
 ; CHECK-NEXT:    EMIT vp<[[RESUME_2_P:%.*]]>.1 = resume-phi vp<[[RESUME_2]]>.1, ir<33>
 ; CHECK-NEXT:    EMIT vp<[[RESUME_3_P:%.*]]>.2 = resume-phi vp<[[RESUME_3]]>.2, ir<33>
+; CHECK-NEXT:    EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
 ; CHECK-NEXT:    IR   %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %for.2 = phi i16 [ 33, %entry ], [ %for.1, %loop ] (extra operand: vp<[[RESUME_2_P]]>.1 from scalar.ph)
 ; CHECK-NEXT:    IR   %for.3 = phi i16 [ 33, %entry ], [ %for.2, %loop ] (extra operand: vp<[[RESUME_3_P]]>.2 from scalar.ph)
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond.not = icmp eq i64 %iv.next, 1000
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -186,9 +188,9 @@ define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) {
 ; CHECK-NEXT:     vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT:     CLONE ir<%gep> = getelementptr ir<%base>, vp<[[SCALAR_STEPS]]>
 ; CHECK-NEXT:     EMIT vp<[[SPLICE_X:%.]]> = first-order splice ir<%for.x>, ir<%for.x.next>
-; CHECK-NEXT:     WIDEN-CAST ir<%for.x.prev> = trunc  vp<[[SPLICE_X]]> to i32
+; CHECK-NEXT:     WIDEN-CAST ir<%for.x.prev> = trunc vp<[[SPLICE_X]]> to i32
 ; CHECK-NEXT:     EMIT vp<[[SPLICE_Y:%.+]]> = first-order splice ir<%for.y>, ir<%for.x.prev>
-; CHECK-NEXT:     WIDEN-CAST ir<%for.y.i64> = sext  vp<[[SPLICE_Y]]> to i64
+; CHECK-NEXT:     WIDEN-CAST ir<%for.y.i64> = sext vp<[[SPLICE_Y]]> to i64
 ; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
 ; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR]]>, ir<%for.y.i64>
 ; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
@@ -205,12 +207,13 @@ define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) {
 ; CHECK-NEXT: Successor(s): ir-bb<ret>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_X:%.+]]> = resume-phi vp<[[EXT_X]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_Y:%.+]]>.1 = resume-phi vp<[[EXT_Y]]>.1, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %for.x = phi i64 [ %for.x.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_X]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %for.y = phi i32 [ %for.x.prev, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_Y]]>.1 from scalar.ph)
 ; CHECK:     No successors
@@ -263,9 +266,9 @@ define i32 @test_chained_first_order_recurrences_5_hoist_to_load(ptr %base) {
 ; CHECK-NEXT:     WIDEN ir<%l> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT:     WIDEN ir<%for.x.next> = mul ir<%l>, ir<2>
 ; CHECK-NEXT:     EMIT vp<[[SPLICE_X:%.]]> = first-order splice ir<%for.x>, ir<%for.x.next>
-; CHECK-NEXT:     WIDEN-CAST ir<%for.x.prev> = trunc  vp<[[SPLICE_X]]> to i32
+; CHECK-NEXT:     WIDEN-CAST ir<%for.x.prev> = trunc vp<[[SPLICE_X]]> to i32
 ; CHECK-NEXT:     EMIT vp<[[SPLICE_Y:%.+]]> = first-order splice ir<%for.y>, ir<%for.x.prev>
-; CHECK-NEXT:     WIDEN-CAST ir<%for.y.i64> = sext  vp<[[SPLICE_Y]]> to i64
+; CHECK-NEXT:     WIDEN-CAST ir<%for.y.i64> = sext vp<[[SPLICE_Y]]> to i64
 ; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
 ; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR]]>, ir<%for.y.i64>
 ; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
@@ -282,12 +285,13 @@ define i32 @test_chained_first_order_recurrences_5_hoist_to_load(ptr %base) {
 ; CHECK-NEXT: Successor(s): ir-bb<ret>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_X:%.+]]> = resume-phi vp<[[EXT_X]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_Y:%.+]]>.1 = resume-phi vp<[[EXT_Y]]>.1, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %for.x = phi i64 [ %for.x.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_X]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %for.y = phi i32 [ %for.x.prev, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_Y]]>.1 from scalar.ph)
 ; CHECK:     No successors
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
index 7aedb21..fc71f8a 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
@@ -661,10 +661,10 @@ define double @test_resinking_required(ptr %p, ptr noalias %a, ptr noalias %b) {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT9:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
 ; CHECK-NEXT:    br i1 true, label %End, label %scalar.ph
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    %bc.resume.val = phi i64 [ 0, %middle.block ], [ 0, %Entry ]
 ; CHECK-NEXT:    phi double [ [[TMP0]], %middle.block ], [ 0.000000e+00, %Entry ]
 ; CHECK-NEXT:    phi double [ [[TMP3]], %middle.block ], [ 0.000000e+00, %Entry ]
 ; CHECK-NEXT:    phi double [ [[VECTOR_RECUR_EXTRACT9]], %middle.block ], [ 0.000000e+00, %Entry ]
+; CHECK-NEXT:    %bc.resume.val = phi i64 [ 0, %middle.block ], [ 0, %Entry ]
 ; CHECK:      End:
 ; CHECK-NEXT:    = phi double [ {{.+}}, %Loop ], [ [[TMP0]], %middle.block ]
 ; CHECK-NEXT:    = phi double [ {{.+}}, %Loop ], [ [[TMP3]], %middle.block ]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
index fe16e8c..253ecac 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
@@ -41,8 +41,8 @@ define void @can_sink_after_store(i32 %x, ptr %ptr, i64 %tc) local_unnamed_addr
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR:%.*]]
 ; CHECK:       for:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[PRE_NEXT:%.*]], [[FOR]] ]
@@ -121,8 +121,8 @@ define void @sink_sdiv(i32 %x, ptr %ptr, i64 %tc) local_unnamed_addr #0 {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR:%.*]]
 ; CHECK:       for:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[PRE_NEXT:%.*]], [[FOR]] ]
@@ -202,8 +202,8 @@ define void @can_sink_with_additional_user(i32 %x, ptr %ptr, i64 %tc) {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR:%.*]]
 ; CHECK:       for:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[PRE_NEXT:%.*]], [[FOR]] ]
@@ -387,9 +387,9 @@ define void @instruction_with_2_FOR_operands(ptr noalias %A, ptr noalias %B, ptr
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[BB74:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT4:%.*]] = phi float [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ]
 ; CHECK-NEXT:    br label [[BB13:%.*]]
 ; CHECK:       bb13:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi float [ [[TMP60:%.*]], [[BB13]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
@@ -463,9 +463,9 @@ define void @instruction_with_2_FOR_operands_and_multiple_other_uses(ptr noalias
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT4:%.*]] = phi float [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
@@ -554,9 +554,9 @@ define void @instruction_with_2_FOR_operands_and_multiple_other_uses_chain(ptr n
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT4:%.*]] = phi float [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
@@ -852,8 +852,8 @@ define void @sink_dominance(ptr %ptr, i32 %N) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
@@ -935,8 +935,8 @@ define void @sink_dominance_2(ptr %ptr, i32 %N) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
@@ -1057,9 +1057,9 @@ define void @test_for_sink_instruction_after_same_incoming_1(ptr %ptr) {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT3:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ 2.000000e+01, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
@@ -1125,9 +1125,9 @@ define void @test_for_sink_instruction_after_same_incoming_2(ptr %ptr) {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2.000000e+01, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2.000000e+01, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT3:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index a127890..0b2e7fe 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -82,11 +82,12 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
 ; CHECK-NEXT:   IR   %0 = phi i32 [ 0, %entry ], [ %conv, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
-; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:        IR   %ec = icmp eq i32 %iv.next, 20001
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -172,11 +173,12 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
 ; CHECK-NEXT:   IR   %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
-; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:        IR   %ec = icmp eq i32 %iv.next, 20001
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -245,12 +247,13 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_RED:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<1234>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
 ; CHECK-NEXT:   IR   %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
-; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %and.red = phi i32 [ 1234, %entry ], [ %and.red.next, %loop ]
 ; CHECK:        IR   %ec = icmp eq i32 %iv.next, 20001
 ; CHECK-NEXT: No successors
@@ -361,11 +364,12 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
 ; CHECK-NEXT:   IR   %0 = phi i32 [ 0, %entry ], [ %conv, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
-; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:        IR   %ec = icmp eq i32 %iv.next, 20001
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -458,11 +462,12 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
 ; CHECK-NEXT:   IR   %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
-; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:        IR   %C = icmp sgt i32 %iv.next, %recur.next
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -503,6 +508,7 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<2> + vp<[[VEC_TC]]> * ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -546,11 +552,12 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[END]]>, ir<2>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 2, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ 2, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %.pn = phi i32 [ 0, %entry ], [ %l, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
 ; CHECK:        IR   %ec = icmp ugt i64 %iv, 3
 ; CHECK-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 2d50f82..509b8f9 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -55,8 +55,8 @@ define void @recurrence_1(ptr readonly noalias %a, ptr noalias %b, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-IC:       scalar.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP17:%.*]], [[SCALAR_BODY]] ]
@@ -111,8 +111,8 @@ define void @recurrence_1(ptr readonly noalias %a, ptr noalias %b, i32 %n) {
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-VF:       scalar.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP16:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP17:%.*]], [[SCALAR_BODY]] ]
@@ -165,8 +165,8 @@ define void @recurrence_1(ptr readonly noalias %a, ptr noalias %b, i32 %n) {
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], [[FOR_PREHEADER]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; SINK-AFTER:       scalar.body:
 ; SINK-AFTER-NEXT:    [[TMP12:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[SCALAR_BODY]] ]
@@ -265,8 +265,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ poison, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.cond.cleanup.loopexit:
@@ -334,8 +334,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ poison, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.cond.cleanup.loopexit:
@@ -398,8 +398,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_PREHEADER]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ poison, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; SINK-AFTER:       for.cond.cleanup.loopexit:
@@ -525,8 +525,8 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-IC:       scalar.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP22:%.*]], [[SCALAR_BODY]] ]
@@ -598,8 +598,8 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-VF:       scalar.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP20:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP21:%.*]], [[SCALAR_BODY]] ]
@@ -669,8 +669,8 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[FOR_PREHEADER]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; SINK-AFTER:       scalar.body:
 ; SINK-AFTER-NEXT:    [[TMP14:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP15:%.*]], [[SCALAR_BODY]] ]
@@ -912,8 +912,8 @@ define i32 @PR27246() {
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_COND1:%.*]]
 ; UNROLL-NO-IC:       for.cond.cleanup:
 ; UNROLL-NO-IC-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
@@ -956,8 +956,8 @@ define i32 @PR27246() {
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_COND1:%.*]]
 ; UNROLL-NO-VF:       for.cond.cleanup:
 ; UNROLL-NO-VF-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
@@ -1005,8 +1005,8 @@ define i32 @PR27246() {
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    br label [[FOR_COND1:%.*]]
 ; SINK-AFTER:       for.cond.cleanup:
 ; SINK-AFTER-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
@@ -1780,8 +1780,8 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
@@ -1836,8 +1836,8 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
@@ -1888,8 +1888,8 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; SINK-AFTER-NEXT:    br label [[FOR_BODY:%.*]]
 ; SINK-AFTER:       for.body:
 ; SINK-AFTER-NEXT:    [[TMP11:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ]
@@ -2016,8 +2016,8 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP30]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP30]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP47:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP48:%.*]], [[FOR_BODY]] ]
@@ -2076,8 +2076,8 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
@@ -2143,8 +2143,8 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; SINK-AFTER-NEXT:    br label [[FOR_BODY:%.*]]
 ; SINK-AFTER:       for.body:
 ; SINK-AFTER-NEXT:    [[TMP25:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP26:%.*]], [[FOR_BODY]] ]
@@ -2240,8 +2240,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP20:%.*]], [[FOR_BODY]] ]
@@ -2299,8 +2299,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP19:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP20:%.*]], [[FOR_BODY]] ]
@@ -2353,8 +2353,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; SINK-AFTER-NEXT:    br label [[FOR_BODY:%.*]]
 ; SINK-AFTER:       for.body:
 ; SINK-AFTER-NEXT:    [[TMP12:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ]
@@ -2685,8 +2685,8 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-IC-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 7
 ; UNROLL-NO-IC-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -2816,8 +2816,8 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-VF-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1
 ; UNROLL-NO-VF-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2
 ; UNROLL-NO-VF-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; UNROLL-NO-VF-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; UNROLL-NO-VF-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE4:%.*]] ]
@@ -2881,8 +2881,8 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; SINK-AFTER-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 3
 ; SINK-AFTER-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
 ; SINK-AFTER-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; SINK-AFTER-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; SINK-AFTER-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; SINK-AFTER-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -2990,8 +2990,8 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-IC-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 7
 ; UNROLL-NO-IC-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -3189,8 +3189,8 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-VF-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1
 ; UNROLL-NO-VF-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2
 ; UNROLL-NO-VF-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; UNROLL-NO-VF-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; UNROLL-NO-VF-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
@@ -3273,8 +3273,8 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; SINK-AFTER-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 3
 ; SINK-AFTER-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
 ; SINK-AFTER-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; SINK-AFTER-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; SINK-AFTER-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; SINK-AFTER-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -3707,13 +3707,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-IC:       vector.ph:
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
-; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
-; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; UNROLL-NO-IC-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
@@ -3726,7 +3721,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-IC-NEXT:    [[ADD]] = add i64 [[PHI]], 1
 ; UNROLL-NO-IC-NEXT:    [[LOAD]] = load i32, ptr [[SRC]], align 4
 ; UNROLL-NO-IC-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; UNROLL-NO-IC-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
 ; UNROLL-NO-IC:       exit:
 ; UNROLL-NO-IC-NEXT:    ret i32 0
 ;
@@ -3736,11 +3731,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-VF:       vector.ph:
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[TMP0]] = load i32, ptr [[SRC:%.*]], align 4
-; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
+; UNROLL-NO-VF-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
@@ -3753,7 +3745,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-VF-NEXT:    [[ADD]] = add i64 [[PHI]], 1
 ; UNROLL-NO-VF-NEXT:    [[LOAD]] = load i32, ptr [[SRC]], align 4
 ; UNROLL-NO-VF-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; UNROLL-NO-VF-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
 ; UNROLL-NO-VF:       exit:
 ; UNROLL-NO-VF-NEXT:    ret i32 0
 ;
@@ -3763,13 +3755,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; SINK-AFTER:       vector.ph:
 ; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SINK-AFTER:       vector.body:
-; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
-; SINK-AFTER-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
-; SINK-AFTER-NEXT:    [[BROADCAST_SPLAT]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; SINK-AFTER-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; SINK-AFTER-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; SINK-AFTER:       middle.block:
 ; SINK-AFTER-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
@@ -3782,7 +3769,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; SINK-AFTER-NEXT:    [[ADD]] = add i64 [[PHI]], 1
 ; SINK-AFTER-NEXT:    [[LOAD]] = load i32, ptr [[SRC]], align 4
 ; SINK-AFTER-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; SINK-AFTER-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; SINK-AFTER-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
 ; SINK-AFTER:       exit:
 ; SINK-AFTER-NEXT:    ret i32 0
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
index c4509e4..7db53d8 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -172,6 +172,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 2
 ; UNROLL-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
 ; UNROLL-NEXT:    [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]]
+; UNROLL-NEXT:    [[TMP13:%.*]] = xor i1 [[COND_2:%.*]], true
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
@@ -184,7 +185,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP6]]
 ; UNROLL-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
 ; UNROLL-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 4
-; UNROLL-NEXT:    br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE3]]
+; UNROLL-NEXT:    br i1 [[COND_2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE3]]
 ; UNROLL:       pred.store.if:
 ; UNROLL-NEXT:    store i32 [[TMP9]], ptr [[TMP7]], align 4
 ; UNROLL-NEXT:    store i32 [[TMP10]], ptr [[TMP8]], align 4
@@ -192,10 +193,8 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL:       pred.store.continue3:
 ; UNROLL-NEXT:    [[TMP11:%.*]] = add i32 [[VEC_PHI]], 1
 ; UNROLL-NEXT:    [[TMP12:%.*]] = add i32 [[VEC_PHI1]], 1
-; UNROLL-NEXT:    [[TMP13:%.*]] = xor i1 [[COND_2]], true
-; UNROLL-NEXT:    [[TMP14:%.*]] = xor i1 [[COND_2]], true
 ; UNROLL-NEXT:    [[PREDPHI]] = select i1 [[TMP13]], i32 [[VEC_PHI]], i32 [[TMP11]]
-; UNROLL-NEXT:    [[PREDPHI4]] = select i1 [[TMP14]], i32 [[VEC_PHI1]], i32 [[TMP12]]
+; UNROLL-NEXT:    [[PREDPHI4]] = select i1 [[TMP13]], i32 [[VEC_PHI1]], i32 [[TMP12]]
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -244,6 +243,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NOSIMPLIFY-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 2
 ; UNROLL-NOSIMPLIFY-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP12:%.*]] = xor i1 [[COND_2:%.*]], true
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NOSIMPLIFY:       vector.body:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
@@ -256,7 +256,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP5]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP6]], align 4
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[COND_2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if:
 ; UNROLL-NOSIMPLIFY-NEXT:    store i32 [[TMP8]], ptr [[TMP6]], align 4
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE]]
@@ -268,10 +268,8 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NOSIMPLIFY:       pred.store.continue3:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP10:%.*]] = add i32 [[VEC_PHI]], 1
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP11:%.*]] = add i32 [[VEC_PHI1]], 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP12:%.*]] = xor i1 [[COND_2]], true
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP13:%.*]] = xor i1 [[COND_2]], true
 ; UNROLL-NOSIMPLIFY-NEXT:    [[PREDPHI]] = select i1 [[TMP12]], i32 [[VEC_PHI]], i32 [[TMP10]]
-; UNROLL-NOSIMPLIFY-NEXT:    [[PREDPHI4]] = select i1 [[TMP13]], i32 [[VEC_PHI1]], i32 [[TMP11]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[PREDPHI4]] = select i1 [[TMP12]], i32 [[VEC_PHI1]], i32 [[TMP11]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -321,9 +319,10 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; VEC-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 2
 ; VEC-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
 ; VEC-NEXT:    [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]]
-; VEC-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[V_2:%.*]], i32 0
 ; VEC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[COND_2:%.*]], i64 0
 ; VEC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    [[TMP17:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; VEC-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[V_2:%.*]], i32 0
 ; VEC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VEC:       vector.body:
 ; VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
@@ -351,7 +350,6 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; VEC-NEXT:    br label [[PRED_STORE_CONTINUE2]]
 ; VEC:       pred.store.continue2:
 ; VEC-NEXT:    [[TMP16:%.*]] = add <2 x i32> [[VEC_PHI]], splat (i32 1)
-; VEC-NEXT:    [[TMP17:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; VEC-NEXT:    [[PREDPHI]] = select <2 x i1> [[TMP17]], <2 x i32> [[VEC_PHI]], <2 x i32> [[TMP16]]
 ; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VEC-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/induction-step.ll b/llvm/test/Transforms/LoopVectorize/induction-step.ll
index ecb00d4..f553864 100644
--- a/llvm/test/Transforms/LoopVectorize/induction-step.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction-step.ll
@@ -1,21 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=8 -S | FileCheck %s
 
 ; int int_inc;
 ;
-;int induction_with_global(int init, int *restrict A, int N) {
+;void induction_with_global(int init, int *restrict A, int N) {
 ;  int x = init;
 ;  for (int i=0;i<N;i++){
 ;    A[i] = x;
 ;    x += int_inc;
 ;  }
-;  return x;
 ;}
 
-; CHECK-LABEL: @induction_with_global(
-; CHECK:       for.body.lr.ph:
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+
+@int_inc = common global i32 0, align 4
+
+define void @induction_with_global(i32 %init, ptr noalias nocapture %A, i32 %N) {
+; CHECK-LABEL: define void @induction_with_global(
+; CHECK-SAME: i32 [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @int_inc, align 4
-; CHECK:       vector.ph:
-; CHECK:         [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 %init, i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP5]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[DOTCAST]], [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[INIT]], [[TMP3]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 [[INIT]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> poison, <8 x i32> zeroinitializer
@@ -24,53 +42,56 @@
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP0]], 8
 ; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> poison, i32 [[TMP7]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    br label %vector.body
-; CHECK:       vector.body:
-; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK-NEXT:    %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ]
-; CHECK:         [[TMP8:%.*]] = add i64 %index, 0
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ [[INDUCTION4]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; CHECK-NEXT:    store <8 x i32> %vec.ind, ptr [[TMP10]], align 4
-; CHECK:         %index.next = add nuw i64 %index, 8
-; CHECK-NEXT:    %vec.ind.next = add <8 x i32> %vec.ind, [[DOTSPLAT6]]
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-
-@int_inc = common global i32 0, align 4
-
-define i32 @induction_with_global(i32 %init, ptr noalias nocapture %A, i32 %N) {
+; CHECK-NEXT:    store <8 x i32> [[VEC_IND]], ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT6]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[X_05:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[X_05]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[TMP0]], [[X_05]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
-  %cmp4 = icmp sgt i32 %N, 0
-  br i1 %cmp4, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
   %0 = load i32, ptr @int_inc, align 4
-  %1 = mul i32 %0, %N
   br label %for.body
 
-for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %x.05 = phi i32 [ %init, %for.body.lr.ph ], [ %add, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %x.05 = phi i32 [ %init, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %iv
   store i32 %x.05, ptr %arrayidx, align 4
   %add = add nsw i32 %0, %x.05
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  %2 = add i32 %1, %init
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  %x.0.lcssa = phi i32 [ %init, %entry ], [ %2, %for.end.loopexit ]
-  ret i32 %x.0.lcssa
-}
+  %iv.next = add nuw nsw i64 %iv, 1
+  %iv.next.trunc = trunc i64 %iv.next to i32
+  %exitcond = icmp eq i32 %iv.next.trunc, %N
+  br i1 %exitcond, label %exit, label %for.body
 
+exit:
+  ret void
+}
 
 ;int induction_with_loop_inv(int init, int *restrict A, int N, int M) {
 ;  int x = init;
@@ -83,82 +104,123 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ;  return x;
 ;}
 
-; CHECK-LABEL: @induction_with_loop_inv(
-; CHECK:       vector.ph:
-; CHECK:         [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 %x.011, i64 0
+define i32 @induction_with_loop_inv(i32 %init, ptr noalias nocapture %A, i32 %N, i32 %M) {
+; CHECK-LABEL: define i32 @induction_with_loop_inv(
+; CHECK-SAME: i32 [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i32 [[N:%.*]], i32 [[M:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[TMP11]], 1
+; CHECK-NEXT:    br label %[[OUTER_HEADER:.*]]
+; CHECK:       [[OUTER_HEADER]]:
+; CHECK-NEXT:    [[INDVARS_IV15:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT16:%.*]], %[[OUTER_LATCH:.*]] ]
+; CHECK-NEXT:    [[J_012:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC5:%.*]], %[[OUTER_LATCH]] ]
+; CHECK-NEXT:    [[X_011:%.*]] = phi i32 [ [[INIT]], %[[ENTRY]] ], [ [[X_0_LCSSA:%.*]], %[[OUTER_LATCH]] ]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[DOTCAST]], [[J_012]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[X_011]], [[TMP1]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 [[X_011]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> poison, i32 %j.012, i64 0
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> poison, i32 [[J_012]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT3]]
 ; CHECK-NEXT:    [[INDUCTION4:%.*]] = add <8 x i32> [[DOTSPLAT]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 %j.012, 8
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[J_012]], 8
 ; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    br label %vector.body
-; CHECK:       vector.body:
-; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK-NEXT:    %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ]
-; CHECK:         [[TMP6:%.*]] = add i64 %index, 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ [[INDUCTION4]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT:    store <8 x i32> %vec.ind, ptr [[TMP8]], align 4
-; CHECK:         %index.next = add nuw i64 %index, 8
-; CHECK-NEXT:    %vec.ind.next = add <8 x i32> %vec.ind, [[DOTSPLAT6]]
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
-
-define i32 @induction_with_loop_inv(i32 %init, ptr noalias nocapture %A, i32 %N, i32 %M) {
+; CHECK-NEXT:    store <8 x i32> [[VEC_IND]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[INNER_EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_HEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[X_011]], %[[OUTER_HEADER]] ]
+; CHECK-NEXT:    br label %[[INNER:.*]]
+; CHECK:       [[INNER]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[INNER]] ]
+; CHECK-NEXT:    [[X_18:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[INNER]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[X_18]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[X_18]], [[J_012]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[INNER_EXIT]], label %[[INNER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[INNER_EXIT]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[X_011]], [[INDVARS_IV15]]
+; CHECK-NEXT:    br label %[[OUTER_LATCH]]
+; CHECK:       [[OUTER_LATCH]]:
+; CHECK-NEXT:    [[X_0_LCSSA]] = phi i32 [ [[TMP9]], %[[INNER_EXIT]] ]
+; CHECK-NEXT:    [[INC5]] = add nuw nsw i32 [[J_012]], 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT16]] = add i32 [[INDVARS_IV15]], [[N]]
+; CHECK-NEXT:    [[EXITCOND17:%.*]] = icmp eq i32 [[INC5]], [[M]]
+; CHECK-NEXT:    br i1 [[EXITCOND17]], label %[[EXIT:.*]], label %[[OUTER_HEADER]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret i32 [[X_0_LCSSA]]
+;
 entry:
-  %cmp10 = icmp sgt i32 %M, 0
-  br i1 %cmp10, label %for.cond1.preheader.lr.ph, label %for.end6
-
-for.cond1.preheader.lr.ph:                        ; preds = %entry
-  %cmp27 = icmp sgt i32 %N, 0
-  br label %for.cond1.preheader
-
-for.cond1.preheader:                              ; preds = %for.inc4, %for.cond1.preheader.lr.ph
-  %indvars.iv15 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next16, %for.inc4 ]
-  %j.012 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc5, %for.inc4 ]
-  %x.011 = phi i32 [ %init, %for.cond1.preheader.lr.ph ], [ %x.1.lcssa, %for.inc4 ]
-  br i1 %cmp27, label %for.body3.preheader, label %for.inc4
-
-for.body3.preheader:                              ; preds = %for.cond1.preheader
-  br label %for.body3
-
-for.body3:                                        ; preds = %for.body3.preheader, %for.body3
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 0, %for.body3.preheader ]
-  %x.18 = phi i32 [ %add, %for.body3 ], [ %x.011, %for.body3.preheader ]
-  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  br label %outer.header
+
+outer.header:
+  %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv.next, %outer.latch ]
+  %j.012 = phi i32 [ 0, %entry ], [ %inc5, %outer.latch ]
+  %x.011 = phi i32 [ %init, %entry ], [ %x.1.lcssa, %outer.latch ]
+  br label %inner
+
+inner:
+  %iv = phi i64 [ 0, %outer.header ], [ %iv.next, %inner ]
+  %x.18 = phi i32 [ %x.011, %outer.header ], [ %add, %inner ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %iv
   store i32 %x.18, ptr %arrayidx, align 4
   %add = add nsw i32 %x.18, %j.012
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.inc4.loopexit, label %for.body3
+  %iv.next = add nuw nsw i64 %iv, 1
+  %iv.next.trunc = trunc i64 %iv.next to i32
+  %inner.ec = icmp eq i32 %iv.next.trunc, %N
+  br i1 %inner.ec, label %inner.exit, label %inner
 
-for.inc4.loopexit:                                ; preds = %for.body3
-  %0 = add i32 %x.011, %indvars.iv15
-  br label %for.inc4
+inner.exit:
+  %add.ivs  = add i32 %x.011, %outer.iv
+  br label %outer.latch
 
-for.inc4:                                         ; preds = %for.inc4.loopexit, %for.cond1.preheader
-  %x.1.lcssa = phi i32 [ %x.011, %for.cond1.preheader ], [ %0, %for.inc4.loopexit ]
+outer.latch:
+  %x.1.lcssa = phi i32 [ %add.ivs, %inner.exit ]
   %inc5 = add nuw nsw i32 %j.012, 1
-  %indvars.iv.next16 = add i32 %indvars.iv15, %N
-  %exitcond17 = icmp eq i32 %inc5, %M
-  br i1 %exitcond17, label %for.end6.loopexit, label %for.cond1.preheader
-
-for.end6.loopexit:                                ; preds = %for.inc4
-  %x.1.lcssa.lcssa = phi i32 [ %x.1.lcssa, %for.inc4 ]
-  br label %for.end6
+  %outer.iv.next = add i32 %outer.iv, %N
+  %outer.ec = icmp eq i32 %inc5, %M
+  br i1 %outer.ec, label %exit, label %outer.header
 
-for.end6:                                         ; preds = %for.end6.loopexit, %entry
-  %x.0.lcssa = phi i32 [ %init, %entry ], [ %x.1.lcssa.lcssa, %for.end6.loopexit ]
-  ret i32 %x.0.lcssa
+exit:
+  ret i32 %x.1.lcssa
 }
 
-
-; CHECK-LABEL: @non_primary_iv_loop_inv_trunc(
-; CHECK:       vector.ph:
-; CHECK:         [[TMP3:%.*]] = trunc i64 %step to i32
+define void @non_primary_iv_loop_inv_trunc(ptr %a, i64 %n, i64 %step) {
+; CHECK-LABEL: define void @non_primary_iv_loop_inv_trunc(
+; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i64 [[STEP:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[N_VEC]], [[STEP]]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[STEP]] to i32
 ; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT6]]
@@ -166,19 +228,38 @@ for.end6:                                         ; preds = %for.end6.loopexit,
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP3]], 8
 ; CHECK-NEXT:    [[DOTSPLATINSERT8:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT9:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT8]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    br label %vector.body
-; CHECK:       vector.body:
-; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK:         [[VEC_IND10:%.*]] = phi <8 x i32> [ [[INDUCTION7]], %vector.ph ], [ [[VEC_IND_NEXT11:%.*]], %vector.body ]
-; CHECK:         [[TMP6:%.*]] = add i64 %index, 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND10:%.*]] = phi <8 x i32> [ [[INDUCTION7]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
 ; CHECK-NEXT:    store <8 x i32> [[VEC_IND10]], ptr [[TMP8]], align 4
-; CHECK-NEXT:    %index.next = add nuw i64 %index, 8
-; CHECK:         [[VEC_IND_NEXT11]] = add <8 x i32> [[VEC_IND10]], [[DOTSPLAT9]]
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
-
-define void @non_primary_iv_loop_inv_trunc(ptr %a, i64 %n, i64 %step) {
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT11]] = add <8 x i32> [[VEC_IND10]], [[DOTSPLAT9]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[J]] to i32
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], [[STEP]]
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[FOR_BODY]], label %[[FOR_END]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -197,22 +278,43 @@ for.end:
   ret void
 }
 
-; CHECK-LABEL: @iv_no_binary_op_in_descriptor(
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+
+define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) {
+; CHECK-LABEL: define void @iv_no_binary_op_in_descriptor(
+; CHECK-SAME: i1 [[C:%.*]], ptr [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    store <8 x i64> [[VEC_IND]], ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP3]], label %middle.block, label [[VECTOR_BODY]]
-
-define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) {
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT_P:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i64 [[IV]], ptr [[GEP]], align 8
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add i64 [[IV]], 1
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[IV_NEXT_P]] = phi i64 [ [[IV_NEXT]], %[[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT_P]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop.header
 
@@ -231,3 +333,162 @@ loop.latch:
 exit:
   ret void
 }
+
+define void @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
+; CHECK-LABEL: define void @wide_add_induction_step_live_in(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i16 [[OFF:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[O_1:%.*]] = add i16 [[OFF]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i16 [[DOTCAST]], [[O_1]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i16> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i16 [[O_1]], 8
+; CHECK-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i16> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = add <8 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    store <8 x i16> [[TMP4]], ptr [[TMP6]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i16> [[VEC_IND]], [[DOTSPLAT2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i16 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ADD]] = add i16 [[IV_2]], [[O_1]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i16 [[ADD]], ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %o.1 = add i16 %off, 2
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.2 = phi i16 [ 0, %entry ], [ %add, %loop ]
+  %add = add i16 %iv.2, %o.1
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv
+  store i16 %add, ptr %gep.dst, align 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec , label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @wide_sub_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
+; CHECK-LABEL: define void @wide_sub_induction_step_live_in(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i16 [[OFF:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[O_1:%.*]] = add i16 [[OFF]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i16 -2, [[OFF]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i16 [[DOTCAST]], [[TMP0]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i16> zeroinitializer, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i16 [[TMP0]], 8
+; CHECK-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP3]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i16> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    store <8 x i16> [[TMP5]], ptr [[TMP7]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i16> [[VEC_IND]], [[DOTSPLAT2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i16 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[SUB:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[SUB]] = sub i16 [[IV_2]], [[O_1]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i16 [[SUB]], ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %o.1 = add i16 %off, 2
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.2 = phi i16 [ 0, %entry ], [ %sub, %loop ]
+  %sub = sub i16 %iv.2, %o.1
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv
+  store i16 %sub, ptr %gep.dst, align 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec , label %exit, label %loop
+
+exit:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 28c1c2a..96311de 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -1962,6 +1962,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[SMAX]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ]
@@ -1989,7 +1990,6 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE2]]
 ; CHECK:       pred.udiv.continue2:
 ; CHECK-NEXT:    [[TMP13:%.*]] = phi <2 x i32> [ [[TMP7]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF1]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP13]]
 ; CHECK-NEXT:    [[TMP15]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
@@ -2030,6 +2030,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; IND:       vector.ph:
 ; IND-NEXT:    [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483646
 ; IND-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
+; IND-NEXT:    [[TMP11:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison>
+; IND-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i1> [[TMP11]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; IND-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IND:       vector.body:
 ; IND-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ]
@@ -2054,8 +2056,6 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; IND-NEXT:    br label [[PRED_UDIV_CONTINUE2]]
 ; IND:       pred.udiv.continue2:
 ; IND-NEXT:    [[TMP10:%.*]] = phi <2 x i32> [ [[TMP5]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP9]], [[PRED_UDIV_IF1]] ]
-; IND-NEXT:    [[TMP11:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison>
-; IND-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i1> [[TMP11]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; IND-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP10]]
 ; IND-NEXT:    [[TMP13]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
@@ -2097,7 +2097,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL:       vector.ph:
 ; UNROLL-NEXT:    [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483644
 ; UNROLL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
-; UNROLL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; UNROLL-NEXT:    [[TMP27:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison>
+; UNROLL-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i1> [[TMP27]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE8:%.*]] ]
@@ -2143,8 +2144,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
 ; UNROLL:       pred.udiv.continue8:
 ; UNROLL-NEXT:    [[TMP21:%.*]] = phi <2 x i32> [ [[TMP16]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP20]], [[PRED_UDIV_IF7]] ]
-; UNROLL-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP11]], <2 x i32> [[WIDE_LOAD]]
-; UNROLL-NEXT:    [[PREDPHI9:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP21]], <2 x i32> [[WIDE_LOAD2]]
+; UNROLL-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP28]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP11]]
+; UNROLL-NEXT:    [[PREDPHI9:%.*]] = select <2 x i1> [[TMP28]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP21]]
 ; UNROLL-NEXT:    [[TMP22]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; UNROLL-NEXT:    [[TMP23]] = add <2 x i32> [[PREDPHI9]], [[VEC_PHI1]]
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -2189,6 +2190,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[SMAX]], [[N_MOD_VF]]
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE8:%.*]] ]
@@ -2239,10 +2241,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
 ; UNROLL-NO-IC:       pred.udiv.continue8:
 ; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = phi <2 x i32> [ [[TMP20]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP25]], [[PRED_UDIV_IF7]] ]
-; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; UNROLL-NO-IC-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP27]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP14]]
-; UNROLL-NO-IC-NEXT:    [[PREDPHI9:%.*]] = select <2 x i1> [[TMP28]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP26]]
+; UNROLL-NO-IC-NEXT:    [[PREDPHI9:%.*]] = select <2 x i1> [[TMP27]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP26]]
 ; UNROLL-NO-IC-NEXT:    [[TMP29]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; UNROLL-NO-IC-NEXT:    [[TMP30]] = add <2 x i32> [[PREDPHI9]], [[VEC_PHI1]]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -2284,7 +2284,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; INTERLEAVE:       vector.ph:
 ; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483640
 ; INTERLEAVE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0
-; INTERLEAVE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; INTERLEAVE-NEXT:    [[TMP47:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison, i1 poison, i1 poison>
+; INTERLEAVE-NEXT:    [[TMP48:%.*]] = shufflevector <4 x i1> [[TMP47]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; INTERLEAVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE:       vector.body:
 ; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE16:%.*]] ]
@@ -2366,8 +2367,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE16]]
 ; INTERLEAVE:       pred.udiv.continue16:
 ; INTERLEAVE-NEXT:    [[TMP41:%.*]] = phi <4 x i32> [ [[TMP36]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP40]], [[PRED_UDIV_IF15]] ]
-; INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP21]], <4 x i32> [[WIDE_LOAD]]
-; INTERLEAVE-NEXT:    [[PREDPHI17:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP41]], <4 x i32> [[WIDE_LOAD2]]
+; INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[TMP21]]
+; INTERLEAVE-NEXT:    [[PREDPHI17:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_LOAD2]], <4 x i32> [[TMP41]]
 ; INTERLEAVE-NEXT:    [[TMP42]] = add <4 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; INTERLEAVE-NEXT:    [[TMP43]] = add <4 x i32> [[PREDPHI17]], [[VEC_PHI1]]
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -2773,8 +2774,8 @@ define i32 @i8_loop() nounwind readnone ssp uwtable {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]])
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -2844,8 +2845,8 @@ define i32 @i8_loop() nounwind readnone ssp uwtable {
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       loop:
 ; UNROLL-NO-IC-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -2910,8 +2911,8 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]])
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -2981,8 +2982,8 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       loop:
 ; UNROLL-NO-IC-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -3051,8 +3052,8 @@ define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]])
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -3122,8 +3123,8 @@ define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable {
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       loop:
 ; UNROLL-NO-IC-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -5561,9 +5562,9 @@ define i64 @trunc_with_first_order_recurrence() {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[VEC_IND2]], i32 1
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 42, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       exit:
@@ -5781,9 +5782,9 @@ define i64 @trunc_with_first_order_recurrence() {
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD7]], i32 1
 ; UNROLL-NO-IC-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 42, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       exit:
@@ -6236,9 +6237,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ]
@@ -6307,9 +6308,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; IND-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; IND-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; IND:       scalar.ph:
-; IND-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; IND-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; IND-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; IND-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; IND-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; IND-NEXT:    br label [[LOOP:%.*]]
 ; IND:       loop:
 ; IND-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ]
@@ -6382,9 +6383,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL:       scalar.ph:
-; UNROLL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL:       loop:
 ; UNROLL-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ]
@@ -6463,9 +6464,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       loop:
 ; UNROLL-NO-IC-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ]
@@ -6538,9 +6539,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; INTERLEAVE:       scalar.ph:
-; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; INTERLEAVE-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; INTERLEAVE-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; INTERLEAVE-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; INTERLEAVE-NEXT:    br label [[LOOP:%.*]]
 ; INTERLEAVE:       loop:
 ; INTERLEAVE-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index bb17580..5bc832f 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -14,6 +14,7 @@
 ; DBG-NEXT: Successor(s): vector.ph
 ; DBG-EMPTY:
 ; DBG-NEXT: vector.ph:
+; DBG-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<%start> + vp<[[VEC_TC]]> * ir<1>
 ; DBG-NEXT: Successor(s): vector loop
 ; DBG-EMPTY:
 ; DBG-NEXT: <x1> vector loop: {
@@ -76,6 +77,7 @@ declare i32 @llvm.smin.i32(i32, i32)
 ; DBG-NEXT: Successor(s): vector.ph
 ; DBG-EMPTY:
 ; DBG-NEXT: vector.ph:
+; DBG-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<false> + vp<[[VEC_TC]]> * ir<true>
 ; DBG-NEXT: Successor(s): vector loop
 ; DBG-EMPTY:
 ; DBG-NEXT: <x1> vector loop: {
@@ -116,11 +118,13 @@ declare i32 @llvm.smin.i32(i32, i32)
 ; DBG-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; DBG-EMPTY:
 ; DBG-NEXT: scalar.ph:
+; DBG-NEXT:  EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
+; DBG-NEXT:  EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END]]>, ir<false>
 ; DBG-NEXT: Successor(s): ir-bb<loop.header>
 ; DBG-EMPTY:
 ; DBG-NEXT: ir-bb<loop.header>:
-; DBG-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
-; DBG-NEXT:   IR   %d = phi i1 [ false, %entry ], [ %d.next, %loop.latch ]
+; DBG-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
+; DBG-NEXT:   IR   %d = phi i1 [ false, %entry ], [ %d.next, %loop.latch ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; DBG-NEXT:   IR   %d.next = xor i1 %d, true
 ; DBG-NEXT: No successors
 ; DBG-EMPTY:
@@ -222,11 +226,12 @@ exit:
 ; DBG-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; DBG-EMPTY:
 ; DBG-NEXT: scalar.ph:
+; DBG-NEXT:  EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; DBG-NEXT:  EMIT vp<[[RESUME_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
 ; DBG-NEXT: Successor(s): ir-bb<loop>
 ; DBG-EMPTY:
 ; DBG-NEXT: ir-bb<loop>:
-; DBG-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; DBG-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; DBG-NEXT:   IR   %for = phi i32 [ 0, %entry ], [ %iv.trunc, %loop ] (extra operand: vp<[[RESUME_P]]> from scalar.ph)
 ; DBG:        IR   %ec = icmp slt i32 %iv.next.trunc, %n
 ; DBG-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index 0e7a68c..abd91d3 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -1508,9 +1508,9 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[TMP16:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
index bc1c1bf..e8ad6a3 100644
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
@@ -134,12 +134,12 @@ define void @inv_val_store_to_inv_address_conditional_inv(ptr %a, i64 %n, ptr %b
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i1> poison, i1 [[CMP]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i1> poison, i1 [[CMP]], i64 3
-; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i64 3
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i64 0
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT6]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 0
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
index 88be9fa..aef25a0 100644
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -401,8 +401,8 @@ define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_INC8_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[TMP4]], [[VECTOR_MEMCHECK]] ], [ [[TMP4]], [[FOR_BODY3_LR_PH]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[ARRAYIDX5_PROMOTED]], [[VECTOR_MEMCHECK]] ], [ [[ARRAYIDX5_PROMOTED]], [[FOR_BODY3_LR_PH]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[TMP4]], [[VECTOR_MEMCHECK]] ], [ [[TMP4]], [[FOR_BODY3_LR_PH]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
 ; CHECK:       for.body3:
 ; CHECK-NEXT:    [[TMP20:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP22:%.*]], [[FOR_BODY3]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll
index 07ee589..681ffe9 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll
@@ -40,8 +40,8 @@ define i64 @select_iv_def_from_outer_loop(ptr %a, i64 %start, i64 %n) {
 ; CHECK-VF4IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF4IC1-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LOOP_EXIT]], label %[[SCALAR_PH]]
 ; CHECK-VF4IC1:       [[SCALAR_PH]]:
-; CHECK-VF4IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF4IC1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_OUTER]], %[[OUTER_LOOP]] ]
+; CHECK-VF4IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF4IC1-NEXT:    br label %[[INNER_LOOP:.*]]
 ; CHECK-VF4IC1:       [[INNER_LOOP]]:
 ; CHECK-VF4IC1-NEXT:    [[RDX_INNER:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT:%.*]], %[[INNER_LOOP]] ]
@@ -114,8 +114,8 @@ define i64 @select_iv_def_from_outer_loop(ptr %a, i64 %start, i64 %n) {
 ; CHECK-VF4IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF4IC4-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LOOP_EXIT]], label %[[SCALAR_PH]]
 ; CHECK-VF4IC4:       [[SCALAR_PH]]:
-; CHECK-VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF4IC4-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_OUTER]], %[[OUTER_LOOP]] ]
+; CHECK-VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF4IC4-NEXT:    br label %[[INNER_LOOP:.*]]
 ; CHECK-VF4IC4:       [[INNER_LOOP]]:
 ; CHECK-VF4IC4-NEXT:    [[RDX_INNER:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT:%.*]], %[[INNER_LOOP]] ]
@@ -189,8 +189,8 @@ define i64 @select_iv_def_from_outer_loop(ptr %a, i64 %start, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF1IC4-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LOOP_EXIT]], label %[[SCALAR_PH]]
 ; CHECK-VF1IC4:       [[SCALAR_PH]]:
-; CHECK-VF1IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF1IC4-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_OUTER]], %[[OUTER_LOOP]] ]
+; CHECK-VF1IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF1IC4-NEXT:    br label %[[INNER_LOOP:.*]]
 ; CHECK-VF1IC4:       [[INNER_LOOP]]:
 ; CHECK-VF1IC4-NEXT:    [[RDX_INNER:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT:%.*]], %[[INNER_LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
index 10b6d1f..482f731 100644
--- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --replace-value-regex "!llvm.loop ![0-9]+" --version 5
 ; RUN: opt -S -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 < %s | FileCheck --check-prefixes=CHECK,VEC %s
-; RUN: opt -S -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=1 < %s | FileCheck --check-prefixes=CHECK %s
+; RUN: opt -S -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=1 < %s | FileCheck --check-prefixes=CHECK,INTERLEAVE %s
 
 define i32 @postinc(i32 %k)  {
 ; CHECK-LABEL: define i32 @postinc(
@@ -430,6 +430,39 @@ define i64 @iv_scalar_steps_and_outside_users(ptr %ptr) {
 ; VEC-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ], [ 1001, %[[MIDDLE_BLOCK]] ]
 ; VEC-NEXT:    ret i64 [[IV_LCSSA]]
 ;
+; INTERLEAVE-LABEL: define i64 @iv_scalar_steps_and_outside_users(
+; INTERLEAVE-SAME: ptr [[PTR:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[TMP0]]
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[TMP1]]
+; INTERLEAVE-NEXT:    store i64 [[TMP0]], ptr [[TMP2]], align 4
+; INTERLEAVE-NEXT:    store i64 [[TMP1]], ptr [[TMP3]], align 4
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1002
+; INTERLEAVE-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1002, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
+; INTERLEAVE-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]]
+; INTERLEAVE-NEXT:    store i64 [[IV]], ptr [[GEP_PTR]], align 4
+; INTERLEAVE-NEXT:    [[EXITCOND:%.*]] = icmp ugt i64 [[IV]], 1000
+; INTERLEAVE-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[EXIT]]:
+; INTERLEAVE-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ], [ 1001, %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret i64 [[IV_LCSSA]]
+;
 entry:
   br label %loop
 
@@ -485,6 +518,42 @@ define i32 @iv_2_dead_in_loop_only_used_outside(ptr %ptr) {
 ; VEC-NEXT:    [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], %[[LOOP]] ], [ 2002, %[[MIDDLE_BLOCK]] ]
 ; VEC-NEXT:    ret i32 [[IV_2_LCSSA]]
 ;
+; INTERLEAVE-LABEL: define i32 @iv_2_dead_in_loop_only_used_outside(
+; INTERLEAVE-SAME: ptr [[PTR:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[TMP0]]
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[TMP1]]
+; INTERLEAVE-NEXT:    store i64 [[TMP0]], ptr [[TMP2]], align 4
+; INTERLEAVE-NEXT:    store i64 [[TMP1]], ptr [[TMP3]], align 4
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1002
+; INTERLEAVE-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1002, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 2004, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
+; INTERLEAVE-NEXT:    [[IV_2_NEXT]] = add nuw i32 [[IV_2]], 2
+; INTERLEAVE-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]]
+; INTERLEAVE-NEXT:    store i64 [[IV]], ptr [[GEP_PTR]], align 4
+; INTERLEAVE-NEXT:    [[EXITCOND:%.*]] = icmp ugt i64 [[IV]], 1000
+; INTERLEAVE-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[EXIT]]:
+; INTERLEAVE-NEXT:    [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], %[[LOOP]] ], [ 2002, %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret i32 [[IV_2_LCSSA]]
+;
 entry:
   br label %loop
 
@@ -502,3 +571,606 @@ exit:
   %iv.2.lcssa = phi i32 [ %iv.2, %loop ]
   ret i32 %iv.2.lcssa
 }
+
+define i32 @postinc_sub(i32 %k)  {
+; CHECK-LABEL: define i32 @postinc_sub(
+; CHECK-SAME: i32 [[K:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[K]], 2
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[K]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i32 [[K]], [[N_VEC]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[K]], %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[INC]] = sub nsw i32 [[INC_PHI]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], %[[FOR_BODY]] ], [ [[TMP0]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ %k, %entry ], [ %inc, %for.body ]
+  %inc = sub nsw i32 %inc.phi, 1
+  %cmp = icmp eq i32 %inc, 0
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 %inc
+}
+
+define i32 @postinc_swapped_ops(i32 %k)  {
+; CHECK-LABEL: define i32 @postinc_swapped_ops(
+; CHECK-SAME: i32 [[K:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[K]], 2
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[K]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[INC]] = add nsw i32 1, [[INC_PHI]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], %[[FOR_BODY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %inc = add nsw i32 1, %inc.phi
+  %cmp = icmp eq i32 %inc, %k
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 %inc
+}
+
+define i32 @postinc_not_iv_backedge_value(i32 %k)  {
+; VEC-LABEL: define i32 @postinc_not_iv_backedge_value(
+; VEC-SAME: i32 [[K:%.*]]) {
+; VEC-NEXT:  [[ENTRY:.*]]:
+; VEC-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K]], 2
+; VEC-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[K]], 2
+; VEC-NEXT:    [[N_VEC:%.*]] = sub i32 [[K]], [[N_MOD_VF]]
+; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VEC:       [[VECTOR_BODY]]:
+; VEC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[TMP0:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; VEC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; VEC-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[MIDDLE_BLOCK]]:
+; VEC-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; VEC:       [[SCALAR_PH]]:
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT:    br label %[[FOR_BODY:.*]]
+; VEC:       [[FOR_BODY]]:
+; VEC-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; VEC-NEXT:    [[INC]] = add nsw i32 [[INC_PHI]], 1
+; VEC-NEXT:    [[INC_2:%.*]] = add i32 [[INC_PHI]], 2
+; VEC-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]]
+; VEC-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[FOR_END]]:
+; VEC-NEXT:    [[INC_2_LCSSA:%.*]] = phi i32 [ [[INC_2]], %[[FOR_BODY]] ], [ [[TMP2]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    ret i32 [[INC_2_LCSSA]]
+;
+; INTERLEAVE-LABEL: define i32 @postinc_not_iv_backedge_value(
+; INTERLEAVE-SAME: i32 [[K:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K]], 2
+; INTERLEAVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[K]], 2
+; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = sub i32 [[K]], [[N_MOD_VF]]
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 2
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[FOR_BODY:.*]]
+; INTERLEAVE:       [[FOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[INC]] = add nsw i32 [[INC_PHI]], 1
+; INTERLEAVE-NEXT:    [[INC_2:%.*]] = add i32 [[INC_PHI]], 2
+; INTERLEAVE-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]]
+; INTERLEAVE-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[FOR_END]]:
+; INTERLEAVE-NEXT:    [[INC_2_LCSSA:%.*]] = phi i32 [ [[INC_2]], %[[FOR_BODY]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret i32 [[INC_2_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %inc = add nsw i32 %inc.phi, 1
+  %inc.2 = add i32 %inc.phi, 2
+  %cmp = icmp eq i32 %inc, %k
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 %inc.2
+}
+
+define float @fp_postinc_use_fadd(float %init, ptr noalias nocapture %A, i64 %N, float %fpinc) {
+; VEC-LABEL: define float @fp_postinc_use_fadd(
+; VEC-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) {
+; VEC-NEXT:  [[ENTRY:.*]]:
+; VEC-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VEC-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VEC-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VEC-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
+; VEC-NEXT:    [[TMP0:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
+; VEC-NEXT:    [[TMP1:%.*]] = fadd fast float [[INIT]], [[TMP0]]
+; VEC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0
+; VEC-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <2 x float> poison, float [[FPINC]], i64 0
+; VEC-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT1]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> <float 0.000000e+00, float 1.000000e+00>, [[DOTSPLAT2]]
+; VEC-NEXT:    [[INDUCTION:%.*]] = fadd fast <2 x float> [[DOTSPLAT]], [[TMP2]]
+; VEC-NEXT:    [[TMP3:%.*]] = fmul fast float [[FPINC]], 2.000000e+00
+; VEC-NEXT:    [[DOTSPLATINSERT3:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0
+; VEC-NEXT:    [[DOTSPLAT4:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT3]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VEC:       [[VECTOR_BODY]]:
+; VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[VEC_IND:%.*]] = phi <2 x float> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VEC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
+; VEC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
+; VEC-NEXT:    store <2 x float> [[VEC_IND]], ptr [[TMP6]], align 4
+; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VEC-NEXT:    [[VEC_IND_NEXT]] = fadd fast <2 x float> [[VEC_IND]], [[DOTSPLAT4]]
+; VEC-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[MIDDLE_BLOCK]]:
+; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VEC:       [[SCALAR_PH]]:
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi float [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; VEC-NEXT:    br label %[[LOOP:.*]]
+; VEC:       [[LOOP]]:
+; VEC-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[FP_IV:%.*]] = phi float [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; VEC-NEXT:    store float [[FP_IV]], ptr [[GEP_A]], align 4
+; VEC-NEXT:    [[ADD]] = fadd fast float [[FP_IV]], [[FPINC]]
+; VEC-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VEC-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VEC-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[EXIT]]:
+; VEC-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    ret float [[ADD_LCSSA]]
+;
+; INTERLEAVE-LABEL: define float @fp_postinc_use_fadd(
+; INTERLEAVE-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; INTERLEAVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; INTERLEAVE-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = fadd fast float [[INIT]], [[TMP0]]
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[DOTCAST1:%.*]] = sitofp i64 [[INDEX]] to float
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = fmul fast float [[FPINC]], [[DOTCAST1]]
+; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = fadd fast float [[INIT]], [[TMP4]]
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = fmul fast float 0.000000e+00, [[FPINC]]
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = fadd fast float [[OFFSET_IDX]], [[TMP5]]
+; INTERLEAVE-NEXT:    [[TMP7:%.*]] = fmul fast float 1.000000e+00, [[FPINC]]
+; INTERLEAVE-NEXT:    [[TMP8:%.*]] = fadd fast float [[OFFSET_IDX]], [[TMP7]]
+; INTERLEAVE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]]
+; INTERLEAVE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]]
+; INTERLEAVE-NEXT:    store float [[TMP6]], ptr [[TMP9]], align 4
+; INTERLEAVE-NEXT:    store float [[TMP8]], ptr [[TMP10]], align 4
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi float [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[FP_IV:%.*]] = phi float [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; INTERLEAVE-NEXT:    store float [[FP_IV]], ptr [[GEP_A]], align 4
+; INTERLEAVE-NEXT:    [[ADD]] = fadd fast float [[FP_IV]], [[FPINC]]
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; INTERLEAVE-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; INTERLEAVE-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[EXIT]]:
+; INTERLEAVE-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret float [[ADD_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %fp.iv = phi float [ %init, %entry ], [ %add, %loop ]
+  %gep.A = getelementptr inbounds float, ptr %A, i64 %iv
+  store float %fp.iv, ptr %gep.A, align 4
+  %add = fadd fast float %fp.iv, %fpinc
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %add
+}
+
+define float @fp_postinc_use_fsub(float %init, ptr noalias nocapture %A, i64 %N, float %fpinc) {
+; VEC-LABEL: define float @fp_postinc_use_fsub(
+; VEC-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) {
+; VEC-NEXT:  [[ENTRY:.*]]:
+; VEC-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VEC-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VEC-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VEC-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
+; VEC-NEXT:    [[TMP0:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
+; VEC-NEXT:    [[TMP1:%.*]] = fsub fast float [[INIT]], [[TMP0]]
+; VEC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0
+; VEC-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <2 x float> poison, float [[FPINC]], i64 0
+; VEC-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT1]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> <float 0.000000e+00, float 1.000000e+00>, [[DOTSPLAT2]]
+; VEC-NEXT:    [[INDUCTION:%.*]] = fsub fast <2 x float> [[DOTSPLAT]], [[TMP2]]
+; VEC-NEXT:    [[TMP3:%.*]] = fmul fast float [[FPINC]], 2.000000e+00
+; VEC-NEXT:    [[DOTSPLATINSERT3:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0
+; VEC-NEXT:    [[DOTSPLAT4:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT3]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VEC:       [[VECTOR_BODY]]:
+; VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[VEC_IND:%.*]] = phi <2 x float> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VEC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
+; VEC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
+; VEC-NEXT:    store <2 x float> [[VEC_IND]], ptr [[TMP6]], align 4
+; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VEC-NEXT:    [[VEC_IND_NEXT]] = fsub fast <2 x float> [[VEC_IND]], [[DOTSPLAT4]]
+; VEC-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[MIDDLE_BLOCK]]:
+; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VEC:       [[SCALAR_PH]]:
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi float [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; VEC-NEXT:    br label %[[LOOP:.*]]
+; VEC:       [[LOOP]]:
+; VEC-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[FP_IV:%.*]] = phi float [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; VEC-NEXT:    store float [[FP_IV]], ptr [[GEP_A]], align 4
+; VEC-NEXT:    [[ADD]] = fsub fast float [[FP_IV]], [[FPINC]]
+; VEC-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VEC-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VEC-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[EXIT]]:
+; VEC-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    ret float [[ADD_LCSSA]]
+;
+; INTERLEAVE-LABEL: define float @fp_postinc_use_fsub(
+; INTERLEAVE-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; INTERLEAVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; INTERLEAVE-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = fsub fast float [[INIT]], [[TMP0]]
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[DOTCAST1:%.*]] = sitofp i64 [[INDEX]] to float
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = fmul fast float [[FPINC]], [[DOTCAST1]]
+; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = fsub fast float [[INIT]], [[TMP4]]
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = fmul fast float 0.000000e+00, [[FPINC]]
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = fsub fast float [[OFFSET_IDX]], [[TMP5]]
+; INTERLEAVE-NEXT:    [[TMP7:%.*]] = fmul fast float 1.000000e+00, [[FPINC]]
+; INTERLEAVE-NEXT:    [[TMP8:%.*]] = fsub fast float [[OFFSET_IDX]], [[TMP7]]
+; INTERLEAVE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]]
+; INTERLEAVE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]]
+; INTERLEAVE-NEXT:    store float [[TMP6]], ptr [[TMP9]], align 4
+; INTERLEAVE-NEXT:    store float [[TMP8]], ptr [[TMP10]], align 4
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi float [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[FP_IV:%.*]] = phi float [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; INTERLEAVE-NEXT:    store float [[FP_IV]], ptr [[GEP_A]], align 4
+; INTERLEAVE-NEXT:    [[ADD]] = fsub fast float [[FP_IV]], [[FPINC]]
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; INTERLEAVE-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; INTERLEAVE-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[EXIT]]:
+; INTERLEAVE-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret float [[ADD_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %fp.iv = phi float [ %init, %entry ], [ %add, %loop ]
+  %gep.A = getelementptr inbounds float, ptr %A, i64 %iv
+  store float %fp.iv, ptr %gep.A, align 4
+  %add = fsub fast float %fp.iv, %fpinc
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %add
+}
+
+; Test case for https://github.com/llvm/llvm-project/issues/121745.
+define i32 @test_iv_uniform_with_outside_use_scev_simplification(ptr %dst) {
+; VEC-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification(
+; VEC-SAME: ptr [[DST:%.*]]) {
+; VEC-NEXT:  [[ENTRY:.*]]:
+; VEC-NEXT:    [[STEP_1:%.*]] = sext i8 0 to i32
+; VEC-NEXT:    [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1
+; VEC-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VEC:       [[VECTOR_BODY]]:
+; VEC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; VEC-NEXT:    [[TMP6:%.*]] = add i32 [[INDEX]], 1
+; VEC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP0]]
+; VEC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
+; VEC-NEXT:    store <2 x i16> zeroinitializer, ptr [[TMP2]], align 2
+; VEC-NEXT:    [[TMP4:%.*]] = add i32 [[STEP_2]], [[TMP0]]
+; VEC-NEXT:    [[TMP5:%.*]] = add i32 [[STEP_2]], [[TMP6]]
+; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; VEC-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8
+; VEC-NEXT:    br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[MIDDLE_BLOCK]]:
+; VEC-NEXT:    br i1 true, label %[[E_EXIT:.*]], label %[[SCALAR_PH]]
+; VEC:       [[SCALAR_PH]]:
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT:    br label %[[LOOP:.*]]
+; VEC:       [[LOOP]]:
+; VEC-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]]
+; VEC-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; VEC-NEXT:    [[IV_NEXT]] = add i32 [[STEP_2]], [[IV]]
+; VEC-NEXT:    [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 8
+; VEC-NEXT:    br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[E_EXIT]]:
+; VEC-NEXT:    [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    ret i32 [[RES]]
+;
+; INTERLEAVE-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification(
+; INTERLEAVE-SAME: ptr [[DST:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    [[STEP_1:%.*]] = sext i8 0 to i32
+; INTERLEAVE-NEXT:    [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1
+; INTERLEAVE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP0]]
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP1]]
+; INTERLEAVE-NEXT:    store i16 0, ptr [[TMP2]], align 2
+; INTERLEAVE-NEXT:    store i16 0, ptr [[TMP3]], align 2
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = add i32 [[STEP_2]], [[TMP1]]
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8
+; INTERLEAVE-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    br i1 true, label %[[E_EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]]
+; INTERLEAVE-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add i32 [[STEP_2]], [[IV]]
+; INTERLEAVE-NEXT:    [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 8
+; INTERLEAVE-NEXT:    br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[E_EXIT]]:
+; INTERLEAVE-NEXT:    [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %step.1 = sext i8 0 to i32
+  %step.2 = add nsw i32 %step.1, 1
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i32 %iv
+  store i16 0, ptr %gep.dst, align 2
+  %iv.next = add i32 %step.2, %iv
+  %cmp.i = icmp slt i32 %iv.next, 8
+  br i1 %cmp.i, label %loop, label %e.exit
+
+e.exit:
+  %res = phi i32 [ %iv.next, %loop ]
+  ret i32 %res
+}
+
+define i32 @test_iv_uniform_with_outside_use_scev_simplification_2(ptr %dst) {
+; VEC-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification_2(
+; VEC-SAME: ptr [[DST:%.*]]) {
+; VEC-NEXT:  [[ENTRY:.*]]:
+; VEC-NEXT:    [[STEP_1:%.*]] = sext i8 0 to i32
+; VEC-NEXT:    [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1
+; VEC-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP_2]], i64 0
+; VEC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VEC:       [[VECTOR_BODY]]:
+; VEC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 2>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], 2
+; VEC-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
+; VEC-NEXT:    [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 2
+; VEC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP0]]
+; VEC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP1]]
+; VEC-NEXT:    store i16 0, ptr [[TMP2]], align 2
+; VEC-NEXT:    store i16 0, ptr [[TMP3]], align 2
+; VEC-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 1)
+; VEC-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[TMP4]]
+; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; VEC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 4)
+; VEC-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
+; VEC-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[MIDDLE_BLOCK]]:
+; VEC-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; VEC-NEXT:    br i1 true, label %[[E_EXIT:.*]], label %[[SCALAR_PH]]
+; VEC:       [[SCALAR_PH]]:
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT:    br label %[[LOOP:.*]]
+; VEC:       [[LOOP]]:
+; VEC-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]]
+; VEC-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; VEC-NEXT:    [[INC:%.*]] = add i32 [[IV]], 1
+; VEC-NEXT:    [[IV_NEXT]] = add i32 [[STEP_2]], [[INC]]
+; VEC-NEXT:    [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 8
+; VEC-NEXT:    br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[E_EXIT]]:
+; VEC-NEXT:    [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    ret i32 [[RES]]
+;
+; INTERLEAVE-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification_2(
+; INTERLEAVE-SAME: ptr [[DST:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    [[STEP_1:%.*]] = sext i8 0 to i32
+; INTERLEAVE-NEXT:    [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1
+; INTERLEAVE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 2
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP0]]
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP1]]
+; INTERLEAVE-NEXT:    store i16 0, ptr [[TMP2]], align 2
+; INTERLEAVE-NEXT:    store i16 0, ptr [[TMP3]], align 2
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = add i32 [[TMP1]], 1
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = add i32 [[STEP_2]], [[TMP4]]
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
+; INTERLEAVE-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    br i1 true, label %[[E_EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]]
+; INTERLEAVE-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; INTERLEAVE-NEXT:    [[INC:%.*]] = add i32 [[IV]], 1
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add i32 [[STEP_2]], [[INC]]
+; INTERLEAVE-NEXT:    [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 8
+; INTERLEAVE-NEXT:    br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[E_EXIT]]:
+; INTERLEAVE-NEXT:    [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %step.1 = sext i8 0 to i32
+  %step.2 = add nsw i32 %step.1, 1
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i32 %iv
+  store i16 0, ptr %gep.dst, align 2
+  %inc = add i32 %iv, 1
+  %iv.next = add i32 %step.2, %inc
+  %cmp.i = icmp slt i32 %iv.next, 8
+  br i1 %cmp.i, label %loop, label %e.exit
+
+e.exit:
+  %res = phi i32 [ %iv.next, %loop ]
+  ret i32 %res
+}
diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
index 70199fa..1bfb341 100644
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
@@ -500,8 +500,8 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst
 ; CHECK-NEXT:    [[IND_END5:%.*]] = mul i8 84, [[INDUCTION_IV]]
 ; CHECK-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 84, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[IND_END4:%.*]] = mul i8 84, [[INDUCTION_IV]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT9:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT8]], <4 x i8> poison, <4 x i32> zeroinitializer
@@ -590,8 +590,8 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[IND_END5:%.*]] = mul i8 84, [[INDUCTION_IV]]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK-PROFITABLE-BY-DEFAULT:       vec.epilog.ph:
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 84, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[IND_END4:%.*]] = mul i8 84, [[INDUCTION_IV]]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[DOTSPLATINSERT8:%.*]] = insertelement <2 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[DOTSPLAT9:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT8]], <2 x i8> poison, <2 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/pr37248.ll b/llvm/test/Transforms/LoopVectorize/pr37248.ll
index ed7762f..fe660a8 100644
--- a/llvm/test/Transforms/LoopVectorize/pr37248.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr37248.ll
@@ -41,26 +41,26 @@ define void @f1(ptr noalias %b, i1 %c, i32 %start) {
 ; CHECK-NEXT:    [[IND_END:%.*]] = sub i32 [[START]], [[N_VEC]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[START]], [[INDEX]]
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc i32 [[OFFSET_IDX]] to i16
-; CHECK-NEXT:    [[TMP11:%.*]] = add i16 [[TMP10]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc i32 [[OFFSET_IDX]] to i16
+; CHECK-NEXT:    [[TMP12:%.*]] = add i16 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
 ; CHECK-NEXT:    store i32 10, ptr [[B]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
 ; CHECK:       pred.store.if2:
 ; CHECK-NEXT:    store i32 10, ptr [[B]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE3]]
 ; CHECK:       pred.store.continue3:
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x i16], ptr @a, i16 0, i16 [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x i16], ptr @a, i16 0, i16 [[TMP12]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i32 0
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[TMP16]], i32 -1
 ; CHECK-NEXT:    store <2 x i16> zeroinitializer, ptr [[TMP17]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
index 4f47e66..a129a4b 100644
--- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
@@ -6,25 +6,25 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A:%.*]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = xor <2 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i1> poison, i1 [[C_1:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT1]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i1> poison, i1 [[C_2:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT3]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT4]], splat (i1 true)
+; CHECK-NEXT:    [[TMP7:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP6]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i32> poison, i32 [[A:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT4]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <2 x i32> [[BROADCAST_SPLAT5]], splat (i32 1)
+; CHECK-NEXT:    [[TMP5:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[BROADCAST_SPLAT4]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ <i32 35902, i32 0>, [[VECTOR_PH]] ], [ [[PREDPHI7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP0:%.*]] = add <2 x i32> [[VEC_PHI]], splat (i32 10)
-; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT4]], splat (i1 true)
-; CHECK-NEXT:    [[TMP7:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP6]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[TMP0]], splat (i32 20)
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[BROADCAST_SPLAT4]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> splat (i32 9), <2 x i32> [[VEC_IND]]
 ; CHECK-NEXT:    [[PREDPHI5:%.*]] = select <2 x i1> [[TMP7]], <2 x i32> splat (i32 9), <2 x i32> [[PREDPHI]]
 ; CHECK-NEXT:    [[PREDPHI6:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> [[TMP0]], <2 x i32> [[VEC_PHI]]
diff --git a/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll b/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll
index 3dafe827..a4b229d 100644
--- a/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll
@@ -52,13 +52,13 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) {
 ; CHECK:       vector.ph7:
 ; CHECK-NEXT:    [[N_MOD_VF8:%.*]] = urem i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF8]]
-; CHECK-NEXT:    br label [[VECTOR_BODY11:%.*]]
-; CHECK:       vector.body9:
-; CHECK-NEXT:    [[INDEX12:%.*]] = phi i64 [ 0, [[VECTOR_PH7]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY11]] ]
+; CHECK-NEXT:    br label [[VECTOR_BODY10:%.*]]
+; CHECK:       vector.body10:
+; CHECK-NEXT:    [[INDEX12:%.*]] = phi i64 [ 0, [[VECTOR_PH7]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY10]] ]
 ; CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4, !alias.scope !4, !noalias !7
 ; CHECK-NEXT:    [[INDEX_NEXT13]] = add nuw i64 [[INDEX12]], 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK4:%.*]], label [[VECTOR_BODY11]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK4:%.*]], label [[VECTOR_BODY10]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       middle.block4:
 ; CHECK-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N10]], label [[LOOP_3_LR_PH:%.*]], label [[SCALAR_PH5]]
@@ -69,8 +69,8 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) {
 ; CHECK-NEXT:    [[IDXPROM_I_I61:%.*]] = and i64 [[IV761_LCSSA]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I62:%.*]] = getelementptr i32, ptr [[TMP0]], i64 [[IDXPROM_I_I61]]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK22:%.*]] = icmp ult i64 [[TMP3]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK22]], label [[SCALAR_PH21:%.*]], label [[VECTOR_MEMCHECK14:%.*]]
-; CHECK:       vector.memcheck14:
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK22]], label [[SCALAR_PH22:%.*]], label [[VECTOR_MEMCHECK15:%.*]]
+; CHECK:       vector.memcheck15:
 ; CHECK-NEXT:    [[SCEVGEP15:%.*]] = getelementptr i8, ptr [[TMP1]], i64 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[IDXPROM_I_I61]], 2
 ; CHECK-NEXT:    [[TMP13:%.*]] = add nuw nsw i64 [[TMP12]], 4
@@ -78,22 +78,22 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) {
 ; CHECK-NEXT:    [[BOUND017:%.*]] = icmp ult ptr [[TMP1]], [[SCEVGEP16]]
 ; CHECK-NEXT:    [[BOUND118:%.*]] = icmp ult ptr [[ARRAYIDX_I_I62]], [[SCEVGEP15]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT19:%.*]] = and i1 [[BOUND017]], [[BOUND118]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT19]], label [[SCALAR_PH21]], label [[VECTOR_PH23:%.*]]
-; CHECK:       vector.ph23:
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT19]], label [[SCALAR_PH22]], label [[VECTOR_PH24:%.*]]
+; CHECK:       vector.ph24:
 ; CHECK-NEXT:    [[N_MOD_VF24:%.*]] = urem i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[N_VEC25:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF24]]
-; CHECK-NEXT:    br label [[VECTOR_BODY26:%.*]]
-; CHECK:       vector.body26:
-; CHECK-NEXT:    [[INDEX29:%.*]] = phi i64 [ 0, [[VECTOR_PH23]] ], [ [[INDEX_NEXT29:%.*]], [[VECTOR_BODY26]] ]
+; CHECK-NEXT:    br label [[VECTOR_BODY27:%.*]]
+; CHECK:       vector.body27:
+; CHECK-NEXT:    [[INDEX29:%.*]] = phi i64 [ 0, [[VECTOR_PH24]] ], [ [[INDEX_NEXT29:%.*]], [[VECTOR_BODY27]] ]
 ; CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4, !alias.scope !10, !noalias !13
 ; CHECK-NEXT:    [[INDEX_NEXT29]] = add nuw i64 [[INDEX29]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT29]], [[N_VEC25]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK20:%.*]], label [[VECTOR_BODY26]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       middle.block20:
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK21:%.*]], label [[VECTOR_BODY27]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK:       middle.block21:
 ; CHECK-NEXT:    [[CMP_N27:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC25]]
-; CHECK-NEXT:    br i1 [[CMP_N27]], label [[LOOP_CLEANUP:%.*]], label [[SCALAR_PH21]]
-; CHECK:       scalar.ph21:
-; CHECK-NEXT:    [[BC_RESUME_VAL26:%.*]] = phi i64 [ [[N_VEC25]], [[MIDDLE_BLOCK20]] ], [ 0, [[VECTOR_MEMCHECK14]] ], [ 0, [[LOOP_3_LR_PH]] ]
+; CHECK-NEXT:    br i1 [[CMP_N27]], label [[LOOP_CLEANUP:%.*]], label [[SCALAR_PH22]]
+; CHECK:       scalar.ph22:
+; CHECK-NEXT:    [[BC_RESUME_VAL26:%.*]] = phi i64 [ [[N_VEC25]], [[MIDDLE_BLOCK21]] ], [ 0, [[VECTOR_MEMCHECK15]] ], [ 0, [[LOOP_3_LR_PH]] ]
 ; CHECK-NEXT:    br label [[LOOP_3:%.*]]
 ; CHECK:       loop.2:
 ; CHECK-NEXT:    [[IV846:%.*]] = phi i64 [ [[IV_NEXT85:%.*]], [[LOOP_2]] ], [ [[BC_RESUME_VAL13]], [[SCALAR_PH5]] ]
@@ -105,7 +105,7 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) {
 ; CHECK-NEXT:    [[EXITCOND92_NOT:%.*]] = icmp eq i64 [[IV846]], [[IV]]
 ; CHECK-NEXT:    br i1 [[EXITCOND92_NOT]], label [[LOOP_3_LR_PH]], label [[LOOP_2]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       loop.3:
-; CHECK-NEXT:    [[IV932:%.*]] = phi i64 [ [[BC_RESUME_VAL26]], [[SCALAR_PH21]] ], [ [[IV_NEXT94:%.*]], [[LOOP_3]] ]
+; CHECK-NEXT:    [[IV932:%.*]] = phi i64 [ [[BC_RESUME_VAL26]], [[SCALAR_PH22]] ], [ [[IV_NEXT94:%.*]], [[LOOP_3]] ]
 ; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX_I_I62]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX_I_I653:%.*]] = getelementptr i32, ptr [[TMP2:%.*]], i64 [[IV93:%.*]]
 ; CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/pr66616.ll b/llvm/test/Transforms/LoopVectorize/pr66616.ll
index 50e1807..24b9441 100644
--- a/llvm/test/Transforms/LoopVectorize/pr66616.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr66616.ll
@@ -46,7 +46,7 @@ define void @pr66616(ptr %ptr) {
 ; CHECK-NEXT:    [[IND_END:%.*]] = add i32 [[ADD3_LCSSA]], [[DOTCAST]]
 ; CHECK-NEXT:    [[IND_END5:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[N_VEC]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY7:%.*]]
-; CHECK:       vector.body5:
+; CHECK:       vector.body4:
 ; CHECK-NEXT:    [[INDEX8:%.*]] = phi i64 [ 0, [[VECTOR_PH3]] ], [ [[INDEX_NEXT9:%.*]], [[VECTOR_BODY7]] ]
 ; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[INDEX8]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll
index 5052ba8..7b57624 100644
--- a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll
+++ b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll
@@ -91,7 +91,7 @@ define void @scalar_cast_dbg(ptr nocapture %a, i32 %start, i64 %k) {
 ; DEBUGLOC:   = trunc i64 %index to i32, !dbg [[CASTLOC:![0-9]+]]
 ;
 ; DEBUGLOC: loop:
-; DEBUGLOC-NOT:   %trunc.iv = trunc i64 %iv to i32, !dbg [[CASTLOC]]
+; DEBUGLOC:   %trunc.iv = trunc i64 %iv to i32, !dbg [[CASTLOC]]
 ;
 entry:
   br label %loop
@@ -109,6 +109,31 @@ exit:
   ret void
 }
 
+define void @widen_intrinsic_dbg(i64 %n, ptr %y, ptr %x) {
+; DEBUGLOC-LABEL: define void @widen_intrinsic_dbg(
+; DEBUGLOC: vector.body:
+; DEBUGLOC:   = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.+}}), !dbg ![[INTRINSIC_LOC:[0-9]+]]
+; DEBUGLOC: loop:
+; DEBUGLOC:   = call float @llvm.sqrt.f32(float %{{.+}}), !dbg ![[INTRINSIC_LOC]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.y = getelementptr inbounds float, ptr %y, i64 %iv
+  %load = load float, ptr %gep.y, align 4
+  %call = call float @llvm.sqrt.f32(float %load)
+  %gep.x = getelementptr inbounds float, ptr %x, i64 %iv
+  store float %call, ptr %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
 !0 = !{!0, !1}
 !1 = !{!"llvm.loop.vectorize.width", i32 4}
 ; CHECK-NOT: !{metadata !"llvm.loop.vectorize.width", i32 4}
@@ -116,3 +141,4 @@ exit:
 
 ; DEBUGLOC: ![[RESUMELOC]] = !DILocation(line: 2
 ; DEBUGLOC: ![[PTRIVLOC]] = !DILocation(line: 12
+; DEBUGLOC: ![[INTRINSIC_LOC]] = !DILocation(line: 44
diff --git a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-reduction-inloop.ll
new file mode 100644
index 0000000..57f0dc2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-reduction-inloop.ll
@@ -0,0 +1,34 @@
+; RUN: opt < %s -passes=debugify,loop-vectorize -force-vector-width=4 -prefer-inloop-reductions -S | FileCheck %s -check-prefix DEBUGLOC
+
+; Testing the debug locations of the generated vector intstructions are same as
+; their scalar counterpart.
+
+define i32 @reduction_sum(ptr %A, ptr %B) {
+; DEBUGLOC-LABEL: define i32 @reduction_sum(
+; DEBUGLOC: vector.body:
+; DEBUGLOC:   = load <4 x i32>, ptr %{{.+}}, align 4, !dbg ![[LOADLOC:[0-9]+]]
+; DEBUGLOC:   = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %{{.+}}), !dbg ![[REDLOC:[0-9]+]]
+; DEBUGLOC: loop:
+; DEBUGLOC:   %[[LOAD:.+]] = load i32, ptr %{{.+}}, align 4, !dbg ![[LOADLOC]]
+; DEBUGLOC:   = add i32 %{{.+}}, %[[LOAD]], !dbg ![[REDLOC]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
+  %gep = getelementptr inbounds i32, ptr %A, i64 %iv
+  %load = load i32, ptr %gep, align 4
+  %red.next = add i32 %red, %load
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 256
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  %red.lcssa = phi i32 [ %red.next, %loop ]
+  ret i32 %red.lcssa
+}
+
+; DEBUGLOC: ![[LOADLOC]] = !DILocation(line: 5
+; DEBUGLOC: ![[REDLOC]] = !DILocation(line: 6
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-align.ll b/llvm/test/Transforms/LoopVectorize/reduction-align.ll
index 69e3e07..3216c92 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-align.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-align.ll
@@ -27,7 +27,7 @@ define void @fn(ptr %hbuf, ptr %ref, i32 %height) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[REF]], align 1, !alias.scope !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[REF]], align 1, !alias.scope [[META0:![0-9]+]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1]] = add <4 x i16> [[BROADCAST_SPLAT]], [[VEC_PHI]]
@@ -36,12 +36,12 @@ define void @fn(ptr %hbuf, ptr %ref, i32 %height) {
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]])
-; CHECK-NEXT:    store i16 [[TMP3]], ptr [[HBUF]], align 1
+; CHECK-NEXT:    store i16 [[TMP3]], ptr [[HBUF]], align 1, !alias.scope [[META6:![0-9]+]], !noalias [[META0]]
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[HEIGHT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi i16 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
@@ -51,7 +51,7 @@ define void @fn(ptr %hbuf, ptr %ref, i32 %height) {
 ; CHECK-NEXT:    store i16 [[ADD]], ptr [[HBUF]], align 1
 ; CHECK-NEXT:    [[INC]] = add i32 [[I]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[HEIGHT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END]]
 ; CHECK:       for.end:
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
index 2404213..ad2f9c6 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
@@ -733,8 +733,8 @@ define i32 @cond-uncond(ptr noalias %src1, ptr noalias %src2, ptr noalias %cond,
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP29]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[RDX1:%.*]] = phi i32 [ [[ADD2:%.*]], [[IF_END:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
@@ -897,8 +897,8 @@ define float @cond_cond(ptr noalias %src1, ptr noalias %src2, ptr noalias %cond,
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP49]], [[MIDDLE_BLOCK]] ], [ 2.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[RDX1:%.*]] = phi float [ [[RES:%.*]], [[FOR_INC:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
@@ -1039,8 +1039,8 @@ define i32 @uncond_cond(ptr noalias %src1, ptr noalias %src2, ptr noalias %cond,
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP29]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ [[RES:%.*]], [[FOR_INC:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
@@ -1168,8 +1168,8 @@ define i32 @uncond_cond_uncond(ptr noalias %src1, ptr noalias %src2, ptr noalias
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP30]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ [[ADD3:%.*]], [[IF_END:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
index 8a8439f..ca971f1 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
@@ -89,13 +89,13 @@ define i8 @PR34687_no_undef(i1 %c, i32 %x, i32 %n) {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = sdiv <4 x i32> splat (i32 99), [[TMP0]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1)
-; CHECK-NEXT:    [[TMP1:%.*]] = sdiv <4 x i32> splat (i32 99), [[TMP0]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i32> [[VEC_PHI]], splat (i32 255)
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[PREDPHI]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc <4 x i32> [[TMP4]] to <4 x i8>
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
index 94fce86..f136b0e 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
@@ -185,10 +185,10 @@ define void @add_unique_ind32(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END:%.*]] = shl i32 [[DOTCAST]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END:%.*]] = shl i32 [[DOTCAST]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl <vscale x 4 x i32> [[TMP6]], splat (i32 1)
 ; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP5]] to i32
@@ -262,11 +262,11 @@ define void @add_unique_indf32(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; CHECK-NEXT:    [[TMP4:%.*]] = fmul float [[DOTCAST]], 2.000000e+00
 ; CHECK-NEXT:    [[IND_END:%.*]] = fadd float [[TMP4]], 0.000000e+00
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK-NEXT:    [[TMP8:%.*]] = uitofp <vscale x 4 x i32> [[TMP7]] to <vscale x 4 x float>
 ; CHECK-NEXT:    [[TMP9:%.*]] = fmul <vscale x 4 x float> [[TMP8]], splat (float 2.000000e+00)
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
index 301526c..550e52d 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
@@ -1006,11 +1006,11 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF4IC1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
 ; CHECK-VF4IC1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-VF4IC1-NEXT:    [[TMP0:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 3)
+; CHECK-VF4IC1-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
 ; CHECK-VF4IC1-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4IC1:       [[VECTOR_BODY]]:
 ; CHECK-VF4IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4IC1-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
 ; CHECK-VF4IC1-NEXT:    [[TMP2]] = or <4 x i1> [[VEC_PHI]], [[TMP1]]
 ; CHECK-VF4IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-VF4IC1-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1048,6 +1048,7 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF4IC4-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
 ; CHECK-VF4IC4-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-VF4IC4-NEXT:    [[TMP0:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 3)
+; CHECK-VF4IC4-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
 ; CHECK-VF4IC4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4IC4:       [[VECTOR_BODY]]:
 ; CHECK-VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -1055,13 +1056,9 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF4IC4-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4IC4-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4IC4-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
-; CHECK-VF4IC4-NEXT:    [[TMP2:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
-; CHECK-VF4IC4-NEXT:    [[TMP3:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
-; CHECK-VF4IC4-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
-; CHECK-VF4IC4-NEXT:    [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP1]]
-; CHECK-VF4IC4-NEXT:    [[TMP6]] = or <4 x i1> [[VEC_PHI1]], [[TMP2]]
-; CHECK-VF4IC4-NEXT:    [[TMP7]] = or <4 x i1> [[VEC_PHI2]], [[TMP3]]
+; CHECK-VF4IC4-NEXT:    [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
+; CHECK-VF4IC4-NEXT:    [[TMP6]] = or <4 x i1> [[VEC_PHI1]], [[TMP4]]
+; CHECK-VF4IC4-NEXT:    [[TMP7]] = or <4 x i1> [[VEC_PHI2]], [[TMP4]]
 ; CHECK-VF4IC4-NEXT:    [[TMP8]] = or <4 x i1> [[VEC_PHI3]], [[TMP4]]
 ; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-VF4IC4-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1100,6 +1097,7 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
 ; CHECK-VF1IC4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-VF1IC4-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[A]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP4:%.*]] = xor i1 [[TMP0]], true
 ; CHECK-VF1IC4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF1IC4:       [[VECTOR_BODY]]:
 ; CHECK-VF1IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -1107,13 +1105,9 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[VEC_PHI1:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF1IC4-NEXT:    [[VEC_PHI2:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF1IC4-NEXT:    [[VEC_PHI3:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF1IC4-NEXT:    [[TMP1:%.*]] = xor i1 [[TMP0]], true
-; CHECK-VF1IC4-NEXT:    [[TMP2:%.*]] = xor i1 [[TMP0]], true
-; CHECK-VF1IC4-NEXT:    [[TMP3:%.*]] = xor i1 [[TMP0]], true
-; CHECK-VF1IC4-NEXT:    [[TMP4:%.*]] = xor i1 [[TMP0]], true
-; CHECK-VF1IC4-NEXT:    [[TMP5]] = or i1 [[VEC_PHI]], [[TMP1]]
-; CHECK-VF1IC4-NEXT:    [[TMP6]] = or i1 [[VEC_PHI1]], [[TMP2]]
-; CHECK-VF1IC4-NEXT:    [[TMP7]] = or i1 [[VEC_PHI2]], [[TMP3]]
+; CHECK-VF1IC4-NEXT:    [[TMP5]] = or i1 [[VEC_PHI]], [[TMP4]]
+; CHECK-VF1IC4-NEXT:    [[TMP6]] = or i1 [[VEC_PHI1]], [[TMP4]]
+; CHECK-VF1IC4-NEXT:    [[TMP7]] = or i1 [[VEC_PHI2]], [[TMP4]]
 ; CHECK-VF1IC4-NEXT:    [[TMP8]] = or i1 [[VEC_PHI3]], [[TMP4]]
 ; CHECK-VF1IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-VF1IC4-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction.ll b/llvm/test/Transforms/LoopVectorize/select-reduction.ll
index 5e281929..836115f 100644
--- a/llvm/test/Transforms/LoopVectorize/select-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-reduction.ll
@@ -17,8 +17,8 @@ define i32 @test(i64 %N, i32 %x) {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[EXTRA_ITER]], 3
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = sub i64 [[EXTRA_ITER]], [[N_VEC]]
 ; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[EXTRA_ITER]], 1
+; CHECK-NEXT:    [[IND_END:%.*]] = sub i64 [[EXTRA_ITER]], [[N_VEC]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -39,8 +39,8 @@ define i32 @test(i64 %N, i32 %x) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[EXTRA_ITER]], [[LOOP_PREHEADER]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[EXTRA_ITER]], [[LOOP_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[NEXT:%.*]] = phi i32 [ [[SEL:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
index 7590bb9..4ba9cc6 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
@@ -281,12 +281,12 @@ define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[COND]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP0]])
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP0]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 276
 ; CHECK-NEXT:    [[TMP3:%.*]] = or i1 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_SPLIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll b/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll
index 40b007e..57bc7b8 100644
--- a/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll
@@ -19,8 +19,8 @@ define void @pr75298_store_reduction_value_in_folded_loop(i64 %iv.start) optsize
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[IV_START]], [[N_VEC]]
 ; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[IV_START]], [[N_VEC]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll
index 31732f0..892ddcc 100644
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll
@@ -10,12 +10,12 @@ define void @tail_fold_switch(ptr %dst, i32 %0) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IND]], splat (i64 4)
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
 ; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
index 1a8f29e..b427b43 100644
--- a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
@@ -47,10 +47,11 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
 ; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: ir-bb<loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:      No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<e2>:
@@ -129,10 +130,11 @@ define i64 @multi_exiting_to_same_exit_live_in_exit_values() {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: ir-bb<loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:      No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>:
@@ -204,10 +206,11 @@ define i64 @multi_exiting_to_same_exit_live_in_exit_values_2() {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: ir-bb<loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:      No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>:
diff --git a/llvm/test/Transforms/LoopVectorize/uncountable-single-exit-loops.ll b/llvm/test/Transforms/LoopVectorize/uncountable-single-exit-loops.ll
new file mode 100644
index 0000000..2520613
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/uncountable-single-exit-loops.ll
@@ -0,0 +1,52 @@
+; REQUIRES: asserts
+; RUN: opt -p loop-vectorize -debug %s 2>&1 | FileCheck %s
+
+
+; CHECK-LABEL: LV: Checking a loop in 'latch_exit_cannot_compute_btc_due_to_step'
+; CHECK: 	   LV: Did not find one integer induction var.
+; CHECK-NEXT:  LV: Not vectorizing: Early exit is not the latch predecessor.
+; CHECK-NEXT:  LV: Interleaving disabled by the pass manager
+; CHECK-NEXT:  LV: Not vectorizing: Cannot prove legality.
+
+; CHECK-LABEL: LV: Checking a loop in 'header_exit_cannot_compute_btc_due_to_step'
+; CHECK:       LV: Found an induction variable.
+; CHECK-NEXT:  LV: Did not find one integer induction var.
+; CHECK-NEXT:  LV: Not vectorizing: Cannot determine exact exit count for latch block.
+; CHECK-NEXT:  LV: Interleaving disabled by the pass manager
+; CHECK-NEXT:  LV: Not vectorizing: Cannot prove legality.
+
+; CHECK-NOT: vector.body
+define void @latch_exit_cannot_compute_btc_due_to_step(ptr %dst, i64 %step) {
+entry:
+  br label %loop
+
+loop:                                   ; preds = %loop, %for.cond.us
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.next = add i64 %iv, %step
+  %gep = getelementptr i8, ptr %dst, i64 %iv
+  store i8 0, ptr %gep, align 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @header_exit_cannot_compute_btc_due_to_step(ptr %dst, i64 %step) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %iv.next = add i64 %iv, %step
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %loop.latch, label %exit
+
+loop.latch:
+  %gep = getelementptr i8, ptr %dst, i64 %iv
+  store i8 0, ptr %gep, align 1
+  br label %loop.header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index 72011ca..7f5e0f3a 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -133,11 +133,11 @@ define void @blend_chain_iv(i1 %c) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[VEC_IND]], <4 x i64> undef
 ; CHECK-NEXT:    [[PREDPHI1:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI]], <4 x i64> undef
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
index fd75177..85b44a7 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
@@ -1,19 +1,138 @@
-; RUN: opt -passes=loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=CHECK,VF8UF1 %s
-; RUN: opt -passes=loop-vectorize -force-vector-width=8 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=CHECK,VF8UF2 %s
-; RUN: opt -passes=loop-vectorize -force-vector-width=16 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=CHECK,VF16UF1 %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='loop-vectorize,verify<loops>' -force-vector-width=8 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF8UF1 %s
+; RUN: opt -passes='loop-vectorize,verify<loops>' -force-vector-width=8 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=VF8UF2 %s
+; RUN: opt -passes='loop-vectorize,verify<loops>' -force-vector-width=16 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF16UF1 %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
 ; Check if the vector loop condition can be simplified to true for a given
 ; VF/IC combination.
 define void @test_tc_less_than_16(ptr %A, i64 %N) {
-; CHECK-LABEL: define void @test_tc_less_than_16(
-; VF8UF1:       [[CMP:%.+]] = icmp eq i64 %index.next, %n.vec
-; VF8UF1-NEXT:  br i1 [[CMP]], label %middle.block, label %vector.body
+; VF8UF1-LABEL: define void @test_tc_less_than_16(
+; VF8UF1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; VF8UF1-NEXT:  [[ENTRY:.*]]:
+; VF8UF1-NEXT:    [[AND:%.*]] = and i64 [[N]], 15
+; VF8UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[AND]], 8
+; VF8UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1:       [[VECTOR_PH]]:
+; VF8UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[AND]], 8
+; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]]
+; VF8UF1-NEXT:    [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]]
+; VF8UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF1:       [[VECTOR_BODY]]:
+; VF8UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; VF8UF1-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; VF8UF1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF8UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
+; VF8UF1-NEXT:    [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF8UF1-NEXT:    store <8 x i8> [[TMP4]], ptr [[TMP3]], align 1
+; VF8UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; VF8UF1-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF8UF1:       [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF1:       [[SCALAR_PH]]:
+; VF8UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ]
+; VF8UF1-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF8UF1-NEXT:    br label %[[LOOP:.*]]
+; VF8UF1:       [[LOOP]]:
+; VF8UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF8UF1-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF8UF1-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF8UF1-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF8UF1-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; VF8UF1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; VF8UF1-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF8UF1:       [[EXIT]]:
+; VF8UF1-NEXT:    ret void
 ;
-; VF8UF2:       br i1 true, label %middle.block, label %vector.body
+; VF8UF2-LABEL: define void @test_tc_less_than_16(
+; VF8UF2-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; VF8UF2-NEXT:  [[ENTRY:.*]]:
+; VF8UF2-NEXT:    [[AND:%.*]] = and i64 [[N]], 15
+; VF8UF2-NEXT:    br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2:       [[VECTOR_PH]]:
+; VF8UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[AND]], 16
+; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]]
+; VF8UF2-NEXT:    [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]]
+; VF8UF2-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF2:       [[VECTOR_BODY]]:
+; VF8UF2-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 0
+; VF8UF2-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF8UF2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8
+; VF8UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
+; VF8UF2-NEXT:    [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF8UF2-NEXT:    [[TMP5:%.*]] = add nsw <8 x i8> [[WIDE_LOAD1]], splat (i8 10)
+; VF8UF2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF8UF2-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8
+; VF8UF2-NEXT:    store <8 x i8> [[TMP4]], ptr [[TMP6]], align 1
+; VF8UF2-NEXT:    store <8 x i8> [[TMP5]], ptr [[TMP7]], align 1
+; VF8UF2-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2:       [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]]
+; VF8UF2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2:       [[SCALAR_PH]]:
+; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ]
+; VF8UF2-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF8UF2-NEXT:    br label %[[LOOP:.*]]
+; VF8UF2:       [[LOOP]]:
+; VF8UF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF8UF2-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF8UF2-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF8UF2-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF8UF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; VF8UF2-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; VF8UF2-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF8UF2:       [[EXIT]]:
+; VF8UF2-NEXT:    ret void
 ;
-; VF16UF1:      br i1 true, label %middle.block, label %vector.body
+; VF16UF1-LABEL: define void @test_tc_less_than_16(
+; VF16UF1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; VF16UF1-NEXT:  [[ENTRY:.*]]:
+; VF16UF1-NEXT:    [[AND:%.*]] = and i64 [[N]], 15
+; VF16UF1-NEXT:    br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1:       [[VECTOR_PH]]:
+; VF16UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[AND]], 16
+; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]]
+; VF16UF1-NEXT:    [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]]
+; VF16UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF16UF1:       [[VECTOR_BODY]]:
+; VF16UF1-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 0
+; VF16UF1-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF16UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; VF16UF1-NEXT:    [[TMP3:%.*]] = add nsw <16 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF16UF1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF16UF1-NEXT:    store <16 x i8> [[TMP3]], ptr [[TMP4]], align 1
+; VF16UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF16UF1:       [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]]
+; VF16UF1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1:       [[SCALAR_PH]]:
+; VF16UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ]
+; VF16UF1-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF16UF1-NEXT:    br label %[[LOOP:.*]]
+; VF16UF1:       [[LOOP]]:
+; VF16UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF16UF1-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF16UF1-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF16UF1-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF16UF1-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; VF16UF1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; VF16UF1-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF16UF1:       [[EXIT]]:
+; VF16UF1-NEXT:    ret void
 ;
 entry:
   %and = and i64 %N, 15
@@ -33,3 +152,1118 @@ loop:
 exit:
   ret void
 }
+
+define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, 10) %N) {
+; VF8UF1-LABEL: define void @remove_loop_region_with_replicate_recipe(
+; VF8UF1-SAME: ptr [[DST:%.*]], i64 range(i64 5, 10) [[N:%.*]]) {
+; VF8UF1-NEXT:  [[ENTRY:.*]]:
+; VF8UF1-NEXT:    [[TMP0:%.*]] = add nsw i64 [[N]], -2
+; VF8UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1:       [[VECTOR_PH]]:
+; VF8UF1-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 7
+; VF8UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF8UF1-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; VF8UF1-NEXT:    [[TMP1:%.*]] = add i64 2, [[N_VEC]]
+; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF1:       [[VECTOR_BODY]]:
+; VF8UF1-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF1-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF1-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[BROADCAST_SPLAT1]]
+; VF8UF1-NEXT:    [[TMP3:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0
+; VF8UF1-NEXT:    br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF8UF1:       [[PRED_STORE_IF]]:
+; VF8UF1-NEXT:    [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 2
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP4]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF8UF1:       [[PRED_STORE_CONTINUE]]:
+; VF8UF1-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP2]], i32 1
+; VF8UF1-NEXT:    br i1 [[TMP5]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF8UF1:       [[PRED_STORE_IF1]]:
+; VF8UF1-NEXT:    [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 3
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP6]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF8UF1:       [[PRED_STORE_CONTINUE2]]:
+; VF8UF1-NEXT:    [[TMP7:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2
+; VF8UF1-NEXT:    br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF8UF1:       [[PRED_STORE_IF3]]:
+; VF8UF1-NEXT:    [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 4
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP8]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF8UF1:       [[PRED_STORE_CONTINUE4]]:
+; VF8UF1-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3
+; VF8UF1-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF8UF1:       [[PRED_STORE_IF5]]:
+; VF8UF1-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 5
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP10]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF8UF1:       [[PRED_STORE_CONTINUE6]]:
+; VF8UF1-NEXT:    [[TMP11:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4
+; VF8UF1-NEXT:    br i1 [[TMP11]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF8UF1:       [[PRED_STORE_IF7]]:
+; VF8UF1-NEXT:    [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 6
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP12]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF8UF1:       [[PRED_STORE_CONTINUE8]]:
+; VF8UF1-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5
+; VF8UF1-NEXT:    br i1 [[TMP13]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF8UF1:       [[PRED_STORE_IF9]]:
+; VF8UF1-NEXT:    [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 7
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP14]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF8UF1:       [[PRED_STORE_CONTINUE10]]:
+; VF8UF1-NEXT:    [[TMP15:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6
+; VF8UF1-NEXT:    br i1 [[TMP15]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF8UF1:       [[PRED_STORE_IF11]]:
+; VF8UF1-NEXT:    [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 8
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP16]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF8UF1:       [[PRED_STORE_CONTINUE12]]:
+; VF8UF1-NEXT:    [[TMP17:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7
+; VF8UF1-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF8UF1:       [[PRED_STORE_IF13]]:
+; VF8UF1-NEXT:    [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 9
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP18]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF8UF1:       [[PRED_STORE_CONTINUE14]]:
+; VF8UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF1:       [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF1:       [[SCALAR_PH]]:
+; VF8UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; VF8UF1-NEXT:    br label %[[LOOP:.*]]
+; VF8UF1:       [[LOOP]]:
+; VF8UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV]]
+; VF8UF1-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; VF8UF1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; VF8UF1-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF1-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF8UF1:       [[EXIT]]:
+; VF8UF1-NEXT:    ret void
+;
+; VF8UF2-LABEL: define void @remove_loop_region_with_replicate_recipe(
+; VF8UF2-SAME: ptr [[DST:%.*]], i64 range(i64 5, 10) [[N:%.*]]) {
+; VF8UF2-NEXT:  [[ENTRY:.*]]:
+; VF8UF2-NEXT:    [[TMP0:%.*]] = add nsw i64 [[N]], -2
+; VF8UF2-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2:       [[VECTOR_PH]]:
+; VF8UF2-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 15
+; VF8UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF8UF2-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; VF8UF2-NEXT:    [[TMP1:%.*]] = add i64 2, [[N_VEC]]
+; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF2:       [[VECTOR_BODY]]:
+; VF8UF2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF2-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF2-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[BROADCAST_SPLAT1]]
+; VF8UF2-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i64> <i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT1]]
+; VF8UF2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0
+; VF8UF2-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF8UF2:       [[PRED_STORE_IF]]:
+; VF8UF2-NEXT:    [[TMP5:%.*]] = getelementptr i16, ptr [[DST]], i64 2
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP5]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF8UF2:       [[PRED_STORE_CONTINUE]]:
+; VF8UF2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP2]], i32 1
+; VF8UF2-NEXT:    br i1 [[TMP6]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF8UF2:       [[PRED_STORE_IF1]]:
+; VF8UF2-NEXT:    [[TMP7:%.*]] = getelementptr i16, ptr [[DST]], i64 3
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP7]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF8UF2:       [[PRED_STORE_CONTINUE2]]:
+; VF8UF2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2
+; VF8UF2-NEXT:    br i1 [[TMP8]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF8UF2:       [[PRED_STORE_IF3]]:
+; VF8UF2-NEXT:    [[TMP9:%.*]] = getelementptr i16, ptr [[DST]], i64 4
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP9]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF8UF2:       [[PRED_STORE_CONTINUE4]]:
+; VF8UF2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3
+; VF8UF2-NEXT:    br i1 [[TMP10]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF8UF2:       [[PRED_STORE_IF5]]:
+; VF8UF2-NEXT:    [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 5
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP11]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF8UF2:       [[PRED_STORE_CONTINUE6]]:
+; VF8UF2-NEXT:    [[TMP12:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4
+; VF8UF2-NEXT:    br i1 [[TMP12]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF8UF2:       [[PRED_STORE_IF7]]:
+; VF8UF2-NEXT:    [[TMP13:%.*]] = getelementptr i16, ptr [[DST]], i64 6
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP13]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF8UF2:       [[PRED_STORE_CONTINUE8]]:
+; VF8UF2-NEXT:    [[TMP14:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5
+; VF8UF2-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF8UF2:       [[PRED_STORE_IF9]]:
+; VF8UF2-NEXT:    [[TMP15:%.*]] = getelementptr i16, ptr [[DST]], i64 7
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP15]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF8UF2:       [[PRED_STORE_CONTINUE10]]:
+; VF8UF2-NEXT:    [[TMP16:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6
+; VF8UF2-NEXT:    br i1 [[TMP16]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF8UF2:       [[PRED_STORE_IF11]]:
+; VF8UF2-NEXT:    [[TMP17:%.*]] = getelementptr i16, ptr [[DST]], i64 8
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP17]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF8UF2:       [[PRED_STORE_CONTINUE12]]:
+; VF8UF2-NEXT:    [[TMP18:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7
+; VF8UF2-NEXT:    br i1 [[TMP18]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF8UF2:       [[PRED_STORE_IF13]]:
+; VF8UF2-NEXT:    [[TMP19:%.*]] = getelementptr i16, ptr [[DST]], i64 9
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP19]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF8UF2:       [[PRED_STORE_CONTINUE14]]:
+; VF8UF2-NEXT:    [[TMP20:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; VF8UF2-NEXT:    br i1 [[TMP20]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
+; VF8UF2:       [[PRED_STORE_IF15]]:
+; VF8UF2-NEXT:    [[TMP21:%.*]] = getelementptr i16, ptr [[DST]], i64 10
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP21]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE16]]
+; VF8UF2:       [[PRED_STORE_CONTINUE16]]:
+; VF8UF2-NEXT:    [[TMP22:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1
+; VF8UF2-NEXT:    br i1 [[TMP22]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; VF8UF2:       [[PRED_STORE_IF17]]:
+; VF8UF2-NEXT:    [[TMP23:%.*]] = getelementptr i16, ptr [[DST]], i64 11
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP23]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE18]]
+; VF8UF2:       [[PRED_STORE_CONTINUE18]]:
+; VF8UF2-NEXT:    [[TMP24:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2
+; VF8UF2-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; VF8UF2:       [[PRED_STORE_IF19]]:
+; VF8UF2-NEXT:    [[TMP25:%.*]] = getelementptr i16, ptr [[DST]], i64 12
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP25]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE20]]
+; VF8UF2:       [[PRED_STORE_CONTINUE20]]:
+; VF8UF2-NEXT:    [[TMP26:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3
+; VF8UF2-NEXT:    br i1 [[TMP26]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
+; VF8UF2:       [[PRED_STORE_IF21]]:
+; VF8UF2-NEXT:    [[TMP27:%.*]] = getelementptr i16, ptr [[DST]], i64 13
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP27]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE22]]
+; VF8UF2:       [[PRED_STORE_CONTINUE22]]:
+; VF8UF2-NEXT:    [[TMP28:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4
+; VF8UF2-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; VF8UF2:       [[PRED_STORE_IF23]]:
+; VF8UF2-NEXT:    [[TMP29:%.*]] = getelementptr i16, ptr [[DST]], i64 14
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP29]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE24]]
+; VF8UF2:       [[PRED_STORE_CONTINUE24]]:
+; VF8UF2-NEXT:    [[TMP30:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5
+; VF8UF2-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; VF8UF2:       [[PRED_STORE_IF25]]:
+; VF8UF2-NEXT:    [[TMP31:%.*]] = getelementptr i16, ptr [[DST]], i64 15
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP31]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE26]]
+; VF8UF2:       [[PRED_STORE_CONTINUE26]]:
+; VF8UF2-NEXT:    [[TMP32:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6
+; VF8UF2-NEXT:    br i1 [[TMP32]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; VF8UF2:       [[PRED_STORE_IF27]]:
+; VF8UF2-NEXT:    [[TMP33:%.*]] = getelementptr i16, ptr [[DST]], i64 16
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP33]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE28]]
+; VF8UF2:       [[PRED_STORE_CONTINUE28]]:
+; VF8UF2-NEXT:    [[TMP34:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7
+; VF8UF2-NEXT:    br i1 [[TMP34]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; VF8UF2:       [[PRED_STORE_IF29]]:
+; VF8UF2-NEXT:    [[TMP35:%.*]] = getelementptr i16, ptr [[DST]], i64 17
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP35]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE30]]
+; VF8UF2:       [[PRED_STORE_CONTINUE30]]:
+; VF8UF2-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2:       [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2:       [[SCALAR_PH]]:
+; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; VF8UF2-NEXT:    br label %[[LOOP:.*]]
+; VF8UF2:       [[LOOP]]:
+; VF8UF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV]]
+; VF8UF2-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; VF8UF2-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; VF8UF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF8UF2:       [[EXIT]]:
+; VF8UF2-NEXT:    ret void
+;
+; VF16UF1-LABEL: define void @remove_loop_region_with_replicate_recipe(
+; VF16UF1-SAME: ptr [[DST:%.*]], i64 range(i64 5, 10) [[N:%.*]]) {
+; VF16UF1-NEXT:  [[ENTRY:.*]]:
+; VF16UF1-NEXT:    [[TMP0:%.*]] = add nsw i64 [[N]], -2
+; VF16UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1:       [[VECTOR_PH]]:
+; VF16UF1-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 15
+; VF16UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF16UF1-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; VF16UF1-NEXT:    [[TMP1:%.*]] = add i64 2, [[N_VEC]]
+; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF16UF1:       [[VECTOR_BODY]]:
+; VF16UF1-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF16UF1-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
+; VF16UF1-NEXT:    [[TMP2:%.*]] = icmp ule <16 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT1]]
+; VF16UF1-NEXT:    [[TMP3:%.*]] = extractelement <16 x i1> [[TMP2]], i32 0
+; VF16UF1-NEXT:    br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF16UF1:       [[PRED_STORE_IF]]:
+; VF16UF1-NEXT:    [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 2
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP4]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF16UF1:       [[PRED_STORE_CONTINUE]]:
+; VF16UF1-NEXT:    [[TMP5:%.*]] = extractelement <16 x i1> [[TMP2]], i32 1
+; VF16UF1-NEXT:    br i1 [[TMP5]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF16UF1:       [[PRED_STORE_IF1]]:
+; VF16UF1-NEXT:    [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 3
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP6]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF16UF1:       [[PRED_STORE_CONTINUE2]]:
+; VF16UF1-NEXT:    [[TMP7:%.*]] = extractelement <16 x i1> [[TMP2]], i32 2
+; VF16UF1-NEXT:    br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF16UF1:       [[PRED_STORE_IF3]]:
+; VF16UF1-NEXT:    [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 4
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP8]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF16UF1:       [[PRED_STORE_CONTINUE4]]:
+; VF16UF1-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP2]], i32 3
+; VF16UF1-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF16UF1:       [[PRED_STORE_IF5]]:
+; VF16UF1-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 5
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP10]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF16UF1:       [[PRED_STORE_CONTINUE6]]:
+; VF16UF1-NEXT:    [[TMP11:%.*]] = extractelement <16 x i1> [[TMP2]], i32 4
+; VF16UF1-NEXT:    br i1 [[TMP11]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF16UF1:       [[PRED_STORE_IF7]]:
+; VF16UF1-NEXT:    [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 6
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP12]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF16UF1:       [[PRED_STORE_CONTINUE8]]:
+; VF16UF1-NEXT:    [[TMP13:%.*]] = extractelement <16 x i1> [[TMP2]], i32 5
+; VF16UF1-NEXT:    br i1 [[TMP13]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF16UF1:       [[PRED_STORE_IF9]]:
+; VF16UF1-NEXT:    [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 7
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP14]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF16UF1:       [[PRED_STORE_CONTINUE10]]:
+; VF16UF1-NEXT:    [[TMP15:%.*]] = extractelement <16 x i1> [[TMP2]], i32 6
+; VF16UF1-NEXT:    br i1 [[TMP15]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF16UF1:       [[PRED_STORE_IF11]]:
+; VF16UF1-NEXT:    [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 8
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP16]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF16UF1:       [[PRED_STORE_CONTINUE12]]:
+; VF16UF1-NEXT:    [[TMP17:%.*]] = extractelement <16 x i1> [[TMP2]], i32 7
+; VF16UF1-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF16UF1:       [[PRED_STORE_IF13]]:
+; VF16UF1-NEXT:    [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 9
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP18]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF16UF1:       [[PRED_STORE_CONTINUE14]]:
+; VF16UF1-NEXT:    [[TMP19:%.*]] = extractelement <16 x i1> [[TMP2]], i32 8
+; VF16UF1-NEXT:    br i1 [[TMP19]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
+; VF16UF1:       [[PRED_STORE_IF15]]:
+; VF16UF1-NEXT:    [[TMP20:%.*]] = getelementptr i16, ptr [[DST]], i64 10
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP20]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE16]]
+; VF16UF1:       [[PRED_STORE_CONTINUE16]]:
+; VF16UF1-NEXT:    [[TMP21:%.*]] = extractelement <16 x i1> [[TMP2]], i32 9
+; VF16UF1-NEXT:    br i1 [[TMP21]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; VF16UF1:       [[PRED_STORE_IF17]]:
+; VF16UF1-NEXT:    [[TMP22:%.*]] = getelementptr i16, ptr [[DST]], i64 11
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP22]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE18]]
+; VF16UF1:       [[PRED_STORE_CONTINUE18]]:
+; VF16UF1-NEXT:    [[TMP23:%.*]] = extractelement <16 x i1> [[TMP2]], i32 10
+; VF16UF1-NEXT:    br i1 [[TMP23]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; VF16UF1:       [[PRED_STORE_IF19]]:
+; VF16UF1-NEXT:    [[TMP24:%.*]] = getelementptr i16, ptr [[DST]], i64 12
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP24]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE20]]
+; VF16UF1:       [[PRED_STORE_CONTINUE20]]:
+; VF16UF1-NEXT:    [[TMP25:%.*]] = extractelement <16 x i1> [[TMP2]], i32 11
+; VF16UF1-NEXT:    br i1 [[TMP25]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
+; VF16UF1:       [[PRED_STORE_IF21]]:
+; VF16UF1-NEXT:    [[TMP26:%.*]] = getelementptr i16, ptr [[DST]], i64 13
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP26]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE22]]
+; VF16UF1:       [[PRED_STORE_CONTINUE22]]:
+; VF16UF1-NEXT:    [[TMP27:%.*]] = extractelement <16 x i1> [[TMP2]], i32 12
+; VF16UF1-NEXT:    br i1 [[TMP27]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; VF16UF1:       [[PRED_STORE_IF23]]:
+; VF16UF1-NEXT:    [[TMP28:%.*]] = getelementptr i16, ptr [[DST]], i64 14
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP28]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE24]]
+; VF16UF1:       [[PRED_STORE_CONTINUE24]]:
+; VF16UF1-NEXT:    [[TMP29:%.*]] = extractelement <16 x i1> [[TMP2]], i32 13
+; VF16UF1-NEXT:    br i1 [[TMP29]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; VF16UF1:       [[PRED_STORE_IF25]]:
+; VF16UF1-NEXT:    [[TMP30:%.*]] = getelementptr i16, ptr [[DST]], i64 15
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP30]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE26]]
+; VF16UF1:       [[PRED_STORE_CONTINUE26]]:
+; VF16UF1-NEXT:    [[TMP31:%.*]] = extractelement <16 x i1> [[TMP2]], i32 14
+; VF16UF1-NEXT:    br i1 [[TMP31]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; VF16UF1:       [[PRED_STORE_IF27]]:
+; VF16UF1-NEXT:    [[TMP32:%.*]] = getelementptr i16, ptr [[DST]], i64 16
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP32]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE28]]
+; VF16UF1:       [[PRED_STORE_CONTINUE28]]:
+; VF16UF1-NEXT:    [[TMP33:%.*]] = extractelement <16 x i1> [[TMP2]], i32 15
+; VF16UF1-NEXT:    br i1 [[TMP33]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; VF16UF1:       [[PRED_STORE_IF29]]:
+; VF16UF1-NEXT:    [[TMP34:%.*]] = getelementptr i16, ptr [[DST]], i64 17
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP34]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE30]]
+; VF16UF1:       [[PRED_STORE_CONTINUE30]]:
+; VF16UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF16UF1:       [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1:       [[SCALAR_PH]]:
+; VF16UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; VF16UF1-NEXT:    br label %[[LOOP:.*]]
+; VF16UF1:       [[LOOP]]:
+; VF16UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV]]
+; VF16UF1-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; VF16UF1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; VF16UF1-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF16UF1-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF16UF1:       [[EXIT]]:
+; VF16UF1-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 2, %entry ], [ %iv.next, %loop ]
+  %gep.dst = getelementptr i16, ptr %dst, i64 %iv
+  store i16 0, ptr %gep.dst, align 2
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+declare i1 @cond()
+
+define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias %src, ptr %dst) {
+; VF8UF1-LABEL: define void @remove_loop_region_outer_loop(
+; VF8UF1-SAME: i64 range(i64 8, 17) [[N:%.*]], ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) {
+; VF8UF1-NEXT:  [[ENTRY:.*]]:
+; VF8UF1-NEXT:    br label %[[OUTER_HEADER:.*]]
+; VF8UF1:       [[OUTER_HEADER]]:
+; VF8UF1-NEXT:    [[OUTER_IV:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]
+; VF8UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1:       [[VECTOR_PH]]:
+; VF8UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF1:       [[VECTOR_BODY]]:
+; VF8UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF8UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[TMP0]]
+; VF8UF1-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; VF8UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; VF8UF1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; VF8UF1-NEXT:    store <8 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 1
+; VF8UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; VF8UF1-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF8UF1:       [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
+; VF8UF1:       [[SCALAR_PH]]:
+; VF8UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_HEADER]] ]
+; VF8UF1-NEXT:    br label %[[INNER:.*]]
+; VF8UF1:       [[INNER]]:
+; VF8UF1-NEXT:    [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ]
+; VF8UF1-NEXT:    [[GEP_SRC:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[INNER_IV]]
+; VF8UF1-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; VF8UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INNER_IV]]
+; VF8UF1-NEXT:    store i8 [[L]], ptr [[GEP_DST]], align 1
+; VF8UF1-NEXT:    [[IV_NEXT]] = add i64 [[INNER_IV]], 1
+; VF8UF1-NEXT:    [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF1-NEXT:    br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF8UF1:       [[OUTER_LATCH]]:
+; VF8UF1-NEXT:    [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
+; VF8UF1-NEXT:    [[C_2:%.*]] = call i1 @cond()
+; VF8UF1-NEXT:    br i1 [[C_2]], label %[[OUTER_HEADER]], label %[[EXIT:.*]]
+; VF8UF1:       [[EXIT]]:
+; VF8UF1-NEXT:    ret void
+;
+; VF8UF2-LABEL: define void @remove_loop_region_outer_loop(
+; VF8UF2-SAME: i64 range(i64 8, 17) [[N:%.*]], ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) {
+; VF8UF2-NEXT:  [[ENTRY:.*]]:
+; VF8UF2-NEXT:    br label %[[OUTER_HEADER:.*]]
+; VF8UF2:       [[OUTER_HEADER]]:
+; VF8UF2-NEXT:    [[OUTER_IV:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]
+; VF8UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; VF8UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2:       [[VECTOR_PH]]:
+; VF8UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF2:       [[VECTOR_BODY]]:
+; VF8UF2-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 0
+; VF8UF2-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
+; VF8UF2-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
+; VF8UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; VF8UF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 0
+; VF8UF2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; VF8UF2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP3]], i32 8
+; VF8UF2-NEXT:    store <8 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 1
+; VF8UF2-NEXT:    store <8 x i8> [[WIDE_LOAD1]], ptr [[TMP5]], align 1
+; VF8UF2-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2:       [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF8UF2-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
+; VF8UF2:       [[SCALAR_PH]]:
+; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_HEADER]] ]
+; VF8UF2-NEXT:    br label %[[INNER:.*]]
+; VF8UF2:       [[INNER]]:
+; VF8UF2-NEXT:    [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ]
+; VF8UF2-NEXT:    [[GEP_SRC:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[INNER_IV]]
+; VF8UF2-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; VF8UF2-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INNER_IV]]
+; VF8UF2-NEXT:    store i8 [[L]], ptr [[GEP_DST]], align 1
+; VF8UF2-NEXT:    [[IV_NEXT]] = add i64 [[INNER_IV]], 1
+; VF8UF2-NEXT:    [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF2-NEXT:    br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF8UF2:       [[OUTER_LATCH]]:
+; VF8UF2-NEXT:    [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
+; VF8UF2-NEXT:    [[C_2:%.*]] = call i1 @cond()
+; VF8UF2-NEXT:    br i1 [[C_2]], label %[[OUTER_HEADER]], label %[[EXIT:.*]]
+; VF8UF2:       [[EXIT]]:
+; VF8UF2-NEXT:    ret void
+;
+; VF16UF1-LABEL: define void @remove_loop_region_outer_loop(
+; VF16UF1-SAME: i64 range(i64 8, 17) [[N:%.*]], ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) {
+; VF16UF1-NEXT:  [[ENTRY:.*]]:
+; VF16UF1-NEXT:    br label %[[OUTER_HEADER:.*]]
+; VF16UF1:       [[OUTER_HEADER]]:
+; VF16UF1-NEXT:    [[OUTER_IV:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]
+; VF16UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; VF16UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1:       [[VECTOR_PH]]:
+; VF16UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF16UF1:       [[VECTOR_BODY]]:
+; VF16UF1-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 0
+; VF16UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
+; VF16UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; VF16UF1-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 0
+; VF16UF1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP2]], i32 0
+; VF16UF1-NEXT:    store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 1
+; VF16UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF16UF1:       [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF16UF1-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
+; VF16UF1:       [[SCALAR_PH]]:
+; VF16UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_HEADER]] ]
+; VF16UF1-NEXT:    br label %[[INNER:.*]]
+; VF16UF1:       [[INNER]]:
+; VF16UF1-NEXT:    [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ]
+; VF16UF1-NEXT:    [[GEP_SRC:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[INNER_IV]]
+; VF16UF1-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; VF16UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INNER_IV]]
+; VF16UF1-NEXT:    store i8 [[L]], ptr [[GEP_DST]], align 1
+; VF16UF1-NEXT:    [[IV_NEXT]] = add i64 [[INNER_IV]], 1
+; VF16UF1-NEXT:    [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF16UF1-NEXT:    br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF16UF1:       [[OUTER_LATCH]]:
+; VF16UF1-NEXT:    [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
+; VF16UF1-NEXT:    [[C_2:%.*]] = call i1 @cond()
+; VF16UF1-NEXT:    br i1 [[C_2]], label %[[OUTER_HEADER]], label %[[EXIT:.*]]
+; VF16UF1:       [[EXIT]]:
+; VF16UF1-NEXT:    ret void
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %outer.iv = phi ptr [ %src, %entry ], [ %outer.iv.next, %outer.latch ]
+  br label %inner
+
+inner:
+  %inner.iv = phi i64 [ 0, %outer.header ], [ %iv.next, %inner ]
+  %gep.src = getelementptr i8, ptr %outer.iv, i64 %inner.iv
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr i8, ptr %dst, i64 %inner.iv
+  store i8 %l, ptr %gep.dst, align 1
+  %iv.next = add i64 %inner.iv, 1
+  %c.1 = icmp eq i64 %iv.next, %N
+  br i1 %c.1, label %outer.latch, label %inner
+
+outer.latch:
+  %outer.iv.next = getelementptr i8, ptr %outer.iv, i64 1
+  %c.2 = call i1 @cond()
+  br i1 %c.2, label %outer.header, label %exit
+
+exit:
+  ret void
+}
+
+declare void @llvm.assume(i1)
+
+; Test case for https://github.com/llvm/llvm-project/issues/121897.
+define void @scev_expand_step(i64 %x, ptr %dst) {
+; VF8UF1-LABEL: define void @scev_expand_step(
+; VF8UF1-SAME: i64 [[X:%.*]], ptr [[DST:%.*]]) {
+; VF8UF1-NEXT:  [[ENTRY:.*]]:
+; VF8UF1-NEXT:    [[C:%.*]] = icmp eq i64 [[X]], 65536
+; VF8UF1-NEXT:    call void @llvm.assume(i1 [[C]])
+; VF8UF1-NEXT:    [[FR:%.*]] = freeze i64 [[X]]
+; VF8UF1-NEXT:    [[STEP:%.*]] = add i64 [[FR]], -65534
+; VF8UF1-NEXT:    [[TMP0:%.*]] = udiv i64 15, [[STEP]]
+; VF8UF1-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; VF8UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1:       [[VECTOR_PH]]:
+; VF8UF1-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], 7
+; VF8UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF8UF1-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP1]], 1
+; VF8UF1-NEXT:    [[TMP2:%.*]] = mul i64 [[N_VEC]], [[STEP]]
+; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF1:       [[VECTOR_BODY]]:
+; VF8UF1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF1-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[BROADCAST_SPLAT]]
+; VF8UF1-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; VF8UF1-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF8UF1:       [[PRED_STORE_IF]]:
+; VF8UF1-NEXT:    [[TMP5:%.*]] = mul i64 0, [[STEP]]
+; VF8UF1-NEXT:    [[TMP6:%.*]] = add i64 0, [[TMP5]]
+; VF8UF1-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP8]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF8UF1:       [[PRED_STORE_CONTINUE]]:
+; VF8UF1-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1
+; VF8UF1-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF8UF1:       [[PRED_STORE_IF1]]:
+; VF8UF1-NEXT:    [[TMP10:%.*]] = mul i64 1, [[STEP]]
+; VF8UF1-NEXT:    [[TMP11:%.*]] = add i64 0, [[TMP10]]
+; VF8UF1-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP13]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF8UF1:       [[PRED_STORE_CONTINUE2]]:
+; VF8UF1-NEXT:    [[TMP14:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2
+; VF8UF1-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF8UF1:       [[PRED_STORE_IF3]]:
+; VF8UF1-NEXT:    [[TMP15:%.*]] = mul i64 2, [[STEP]]
+; VF8UF1-NEXT:    [[TMP16:%.*]] = add i64 0, [[TMP15]]
+; VF8UF1-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP18]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF8UF1:       [[PRED_STORE_CONTINUE4]]:
+; VF8UF1-NEXT:    [[TMP19:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3
+; VF8UF1-NEXT:    br i1 [[TMP19]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF8UF1:       [[PRED_STORE_IF5]]:
+; VF8UF1-NEXT:    [[TMP20:%.*]] = mul i64 3, [[STEP]]
+; VF8UF1-NEXT:    [[TMP21:%.*]] = add i64 0, [[TMP20]]
+; VF8UF1-NEXT:    [[TMP22:%.*]] = add i64 [[TMP21]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP22]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP23]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF8UF1:       [[PRED_STORE_CONTINUE6]]:
+; VF8UF1-NEXT:    [[TMP24:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4
+; VF8UF1-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF8UF1:       [[PRED_STORE_IF7]]:
+; VF8UF1-NEXT:    [[TMP25:%.*]] = mul i64 4, [[STEP]]
+; VF8UF1-NEXT:    [[TMP26:%.*]] = add i64 0, [[TMP25]]
+; VF8UF1-NEXT:    [[TMP27:%.*]] = add i64 [[TMP26]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP27]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP28]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF8UF1:       [[PRED_STORE_CONTINUE8]]:
+; VF8UF1-NEXT:    [[TMP29:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5
+; VF8UF1-NEXT:    br i1 [[TMP29]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF8UF1:       [[PRED_STORE_IF9]]:
+; VF8UF1-NEXT:    [[TMP30:%.*]] = mul i64 5, [[STEP]]
+; VF8UF1-NEXT:    [[TMP31:%.*]] = add i64 0, [[TMP30]]
+; VF8UF1-NEXT:    [[TMP32:%.*]] = add i64 [[TMP31]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP32]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP33]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF8UF1:       [[PRED_STORE_CONTINUE10]]:
+; VF8UF1-NEXT:    [[TMP34:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6
+; VF8UF1-NEXT:    br i1 [[TMP34]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF8UF1:       [[PRED_STORE_IF11]]:
+; VF8UF1-NEXT:    [[TMP35:%.*]] = mul i64 6, [[STEP]]
+; VF8UF1-NEXT:    [[TMP36:%.*]] = add i64 0, [[TMP35]]
+; VF8UF1-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP38:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP37]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP38]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF8UF1:       [[PRED_STORE_CONTINUE12]]:
+; VF8UF1-NEXT:    [[TMP39:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7
+; VF8UF1-NEXT:    br i1 [[TMP39]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF8UF1:       [[PRED_STORE_IF13]]:
+; VF8UF1-NEXT:    [[TMP40:%.*]] = mul i64 7, [[STEP]]
+; VF8UF1-NEXT:    [[TMP41:%.*]] = add i64 0, [[TMP40]]
+; VF8UF1-NEXT:    [[TMP42:%.*]] = add i64 [[TMP41]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP42]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP43]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF8UF1:       [[PRED_STORE_CONTINUE14]]:
+; VF8UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF1:       [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF1:       [[SCALAR_PH]]:
+; VF8UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF1-NEXT:    br label %[[LOOP:.*]]
+; VF8UF1:       [[LOOP]]:
+; VF8UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], [[STEP]]
+; VF8UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]]
+; VF8UF1-NEXT:    store i8 0, ptr [[GEP_DST]], align 1
+; VF8UF1-NEXT:    [[EC:%.*]] = icmp slt i64 [[IV_NEXT]], 16
+; VF8UF1-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF8UF1:       [[EXIT]]:
+; VF8UF1-NEXT:    ret void
+;
+; VF8UF2-LABEL: define void @scev_expand_step(
+; VF8UF2-SAME: i64 [[X:%.*]], ptr [[DST:%.*]]) {
+; VF8UF2-NEXT:  [[ENTRY:.*]]:
+; VF8UF2-NEXT:    [[C:%.*]] = icmp eq i64 [[X]], 65536
+; VF8UF2-NEXT:    call void @llvm.assume(i1 [[C]])
+; VF8UF2-NEXT:    [[FR:%.*]] = freeze i64 [[X]]
+; VF8UF2-NEXT:    [[STEP:%.*]] = add i64 [[FR]], -65534
+; VF8UF2-NEXT:    [[TMP0:%.*]] = udiv i64 15, [[STEP]]
+; VF8UF2-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; VF8UF2-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2:       [[VECTOR_PH]]:
+; VF8UF2-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], 15
+; VF8UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF8UF2-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP1]], 1
+; VF8UF2-NEXT:    [[TMP2:%.*]] = mul i64 [[N_VEC]], [[STEP]]
+; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF2:       [[VECTOR_BODY]]:
+; VF8UF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF2-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[BROADCAST_SPLAT]]
+; VF8UF2-NEXT:    [[TMP4:%.*]] = icmp ule <8 x i64> <i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT]]
+; VF8UF2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; VF8UF2-NEXT:    br i1 [[TMP5]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF8UF2:       [[PRED_STORE_IF]]:
+; VF8UF2-NEXT:    [[TMP6:%.*]] = mul i64 0, [[STEP]]
+; VF8UF2-NEXT:    [[TMP7:%.*]] = add i64 0, [[TMP6]]
+; VF8UF2-NEXT:    [[TMP8:%.*]] = add i64 [[TMP7]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP9]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF8UF2:       [[PRED_STORE_CONTINUE]]:
+; VF8UF2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1
+; VF8UF2-NEXT:    br i1 [[TMP10]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF8UF2:       [[PRED_STORE_IF1]]:
+; VF8UF2-NEXT:    [[TMP11:%.*]] = mul i64 1, [[STEP]]
+; VF8UF2-NEXT:    [[TMP12:%.*]] = add i64 0, [[TMP11]]
+; VF8UF2-NEXT:    [[TMP13:%.*]] = add i64 [[TMP12]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP14]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF8UF2:       [[PRED_STORE_CONTINUE2]]:
+; VF8UF2-NEXT:    [[TMP15:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2
+; VF8UF2-NEXT:    br i1 [[TMP15]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF8UF2:       [[PRED_STORE_IF3]]:
+; VF8UF2-NEXT:    [[TMP16:%.*]] = mul i64 2, [[STEP]]
+; VF8UF2-NEXT:    [[TMP17:%.*]] = add i64 0, [[TMP16]]
+; VF8UF2-NEXT:    [[TMP18:%.*]] = add i64 [[TMP17]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP18]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP19]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF8UF2:       [[PRED_STORE_CONTINUE4]]:
+; VF8UF2-NEXT:    [[TMP20:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3
+; VF8UF2-NEXT:    br i1 [[TMP20]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF8UF2:       [[PRED_STORE_IF5]]:
+; VF8UF2-NEXT:    [[TMP21:%.*]] = mul i64 3, [[STEP]]
+; VF8UF2-NEXT:    [[TMP22:%.*]] = add i64 0, [[TMP21]]
+; VF8UF2-NEXT:    [[TMP23:%.*]] = add i64 [[TMP22]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP23]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP24]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF8UF2:       [[PRED_STORE_CONTINUE6]]:
+; VF8UF2-NEXT:    [[TMP25:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4
+; VF8UF2-NEXT:    br i1 [[TMP25]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF8UF2:       [[PRED_STORE_IF7]]:
+; VF8UF2-NEXT:    [[TMP26:%.*]] = mul i64 4, [[STEP]]
+; VF8UF2-NEXT:    [[TMP27:%.*]] = add i64 0, [[TMP26]]
+; VF8UF2-NEXT:    [[TMP28:%.*]] = add i64 [[TMP27]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP28]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP29]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF8UF2:       [[PRED_STORE_CONTINUE8]]:
+; VF8UF2-NEXT:    [[TMP30:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5
+; VF8UF2-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF8UF2:       [[PRED_STORE_IF9]]:
+; VF8UF2-NEXT:    [[TMP31:%.*]] = mul i64 5, [[STEP]]
+; VF8UF2-NEXT:    [[TMP32:%.*]] = add i64 0, [[TMP31]]
+; VF8UF2-NEXT:    [[TMP33:%.*]] = add i64 [[TMP32]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP33]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP34]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF8UF2:       [[PRED_STORE_CONTINUE10]]:
+; VF8UF2-NEXT:    [[TMP35:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6
+; VF8UF2-NEXT:    br i1 [[TMP35]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF8UF2:       [[PRED_STORE_IF11]]:
+; VF8UF2-NEXT:    [[TMP36:%.*]] = mul i64 6, [[STEP]]
+; VF8UF2-NEXT:    [[TMP37:%.*]] = add i64 0, [[TMP36]]
+; VF8UF2-NEXT:    [[TMP38:%.*]] = add i64 [[TMP37]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP39:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP38]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP39]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF8UF2:       [[PRED_STORE_CONTINUE12]]:
+; VF8UF2-NEXT:    [[TMP40:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7
+; VF8UF2-NEXT:    br i1 [[TMP40]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF8UF2:       [[PRED_STORE_IF13]]:
+; VF8UF2-NEXT:    [[TMP41:%.*]] = mul i64 7, [[STEP]]
+; VF8UF2-NEXT:    [[TMP42:%.*]] = add i64 0, [[TMP41]]
+; VF8UF2-NEXT:    [[TMP43:%.*]] = add i64 [[TMP42]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP43]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP44]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF8UF2:       [[PRED_STORE_CONTINUE14]]:
+; VF8UF2-NEXT:    [[TMP45:%.*]] = extractelement <8 x i1> [[TMP4]], i32 0
+; VF8UF2-NEXT:    br i1 [[TMP45]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
+; VF8UF2:       [[PRED_STORE_IF15]]:
+; VF8UF2-NEXT:    [[TMP46:%.*]] = mul i64 8, [[STEP]]
+; VF8UF2-NEXT:    [[TMP47:%.*]] = add i64 0, [[TMP46]]
+; VF8UF2-NEXT:    [[TMP48:%.*]] = add i64 [[TMP47]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP49:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP48]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP49]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE16]]
+; VF8UF2:       [[PRED_STORE_CONTINUE16]]:
+; VF8UF2-NEXT:    [[TMP50:%.*]] = extractelement <8 x i1> [[TMP4]], i32 1
+; VF8UF2-NEXT:    br i1 [[TMP50]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; VF8UF2:       [[PRED_STORE_IF17]]:
+; VF8UF2-NEXT:    [[TMP51:%.*]] = mul i64 9, [[STEP]]
+; VF8UF2-NEXT:    [[TMP52:%.*]] = add i64 0, [[TMP51]]
+; VF8UF2-NEXT:    [[TMP53:%.*]] = add i64 [[TMP52]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP54:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP53]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP54]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE18]]
+; VF8UF2:       [[PRED_STORE_CONTINUE18]]:
+; VF8UF2-NEXT:    [[TMP55:%.*]] = extractelement <8 x i1> [[TMP4]], i32 2
+; VF8UF2-NEXT:    br i1 [[TMP55]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; VF8UF2:       [[PRED_STORE_IF19]]:
+; VF8UF2-NEXT:    [[TMP56:%.*]] = mul i64 10, [[STEP]]
+; VF8UF2-NEXT:    [[TMP57:%.*]] = add i64 0, [[TMP56]]
+; VF8UF2-NEXT:    [[TMP58:%.*]] = add i64 [[TMP57]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP59:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP58]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP59]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE20]]
+; VF8UF2:       [[PRED_STORE_CONTINUE20]]:
+; VF8UF2-NEXT:    [[TMP60:%.*]] = extractelement <8 x i1> [[TMP4]], i32 3
+; VF8UF2-NEXT:    br i1 [[TMP60]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
+; VF8UF2:       [[PRED_STORE_IF21]]:
+; VF8UF2-NEXT:    [[TMP61:%.*]] = mul i64 11, [[STEP]]
+; VF8UF2-NEXT:    [[TMP62:%.*]] = add i64 0, [[TMP61]]
+; VF8UF2-NEXT:    [[TMP63:%.*]] = add i64 [[TMP62]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP64:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP63]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP64]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE22]]
+; VF8UF2:       [[PRED_STORE_CONTINUE22]]:
+; VF8UF2-NEXT:    [[TMP65:%.*]] = extractelement <8 x i1> [[TMP4]], i32 4
+; VF8UF2-NEXT:    br i1 [[TMP65]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; VF8UF2:       [[PRED_STORE_IF23]]:
+; VF8UF2-NEXT:    [[TMP66:%.*]] = mul i64 12, [[STEP]]
+; VF8UF2-NEXT:    [[TMP67:%.*]] = add i64 0, [[TMP66]]
+; VF8UF2-NEXT:    [[TMP68:%.*]] = add i64 [[TMP67]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP69:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP68]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP69]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE24]]
+; VF8UF2:       [[PRED_STORE_CONTINUE24]]:
+; VF8UF2-NEXT:    [[TMP70:%.*]] = extractelement <8 x i1> [[TMP4]], i32 5
+; VF8UF2-NEXT:    br i1 [[TMP70]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; VF8UF2:       [[PRED_STORE_IF25]]:
+; VF8UF2-NEXT:    [[TMP71:%.*]] = mul i64 13, [[STEP]]
+; VF8UF2-NEXT:    [[TMP72:%.*]] = add i64 0, [[TMP71]]
+; VF8UF2-NEXT:    [[TMP73:%.*]] = add i64 [[TMP72]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP74:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP73]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP74]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE26]]
+; VF8UF2:       [[PRED_STORE_CONTINUE26]]:
+; VF8UF2-NEXT:    [[TMP75:%.*]] = extractelement <8 x i1> [[TMP4]], i32 6
+; VF8UF2-NEXT:    br i1 [[TMP75]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; VF8UF2:       [[PRED_STORE_IF27]]:
+; VF8UF2-NEXT:    [[TMP76:%.*]] = mul i64 14, [[STEP]]
+; VF8UF2-NEXT:    [[TMP77:%.*]] = add i64 0, [[TMP76]]
+; VF8UF2-NEXT:    [[TMP78:%.*]] = add i64 [[TMP77]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP79:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP78]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP79]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE28]]
+; VF8UF2:       [[PRED_STORE_CONTINUE28]]:
+; VF8UF2-NEXT:    [[TMP80:%.*]] = extractelement <8 x i1> [[TMP4]], i32 7
+; VF8UF2-NEXT:    br i1 [[TMP80]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; VF8UF2:       [[PRED_STORE_IF29]]:
+; VF8UF2-NEXT:    [[TMP81:%.*]] = mul i64 15, [[STEP]]
+; VF8UF2-NEXT:    [[TMP82:%.*]] = add i64 0, [[TMP81]]
+; VF8UF2-NEXT:    [[TMP83:%.*]] = add i64 [[TMP82]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP84:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP83]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP84]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE30]]
+; VF8UF2:       [[PRED_STORE_CONTINUE30]]:
+; VF8UF2-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2:       [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2:       [[SCALAR_PH]]:
+; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF2-NEXT:    br label %[[LOOP:.*]]
+; VF8UF2:       [[LOOP]]:
+; VF8UF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[IV_NEXT]] = add i64 [[IV]], [[STEP]]
+; VF8UF2-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]]
+; VF8UF2-NEXT:    store i8 0, ptr [[GEP_DST]], align 1
+; VF8UF2-NEXT:    [[EC:%.*]] = icmp slt i64 [[IV_NEXT]], 16
+; VF8UF2-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF8UF2:       [[EXIT]]:
+; VF8UF2-NEXT:    ret void
+;
+; VF16UF1-LABEL: define void @scev_expand_step(
+; VF16UF1-SAME: i64 [[X:%.*]], ptr [[DST:%.*]]) {
+; VF16UF1-NEXT:  [[ENTRY:.*]]:
+; VF16UF1-NEXT:    [[C:%.*]] = icmp eq i64 [[X]], 65536
+; VF16UF1-NEXT:    call void @llvm.assume(i1 [[C]])
+; VF16UF1-NEXT:    [[FR:%.*]] = freeze i64 [[X]]
+; VF16UF1-NEXT:    [[STEP:%.*]] = add i64 [[FR]], -65534
+; VF16UF1-NEXT:    [[TMP0:%.*]] = udiv i64 15, [[STEP]]
+; VF16UF1-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; VF16UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1:       [[VECTOR_PH]]:
+; VF16UF1-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], 15
+; VF16UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF16UF1-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP1]], 1
+; VF16UF1-NEXT:    [[TMP2:%.*]] = mul i64 [[N_VEC]], [[STEP]]
+; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF16UF1:       [[VECTOR_BODY]]:
+; VF16UF1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF16UF1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; VF16UF1-NEXT:    [[TMP3:%.*]] = icmp ule <16 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT]]
+; VF16UF1-NEXT:    [[TMP4:%.*]] = extractelement <16 x i1> [[TMP3]], i32 0
+; VF16UF1-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF16UF1:       [[PRED_STORE_IF]]:
+; VF16UF1-NEXT:    [[TMP5:%.*]] = mul i64 0, [[STEP]]
+; VF16UF1-NEXT:    [[TMP6:%.*]] = add i64 0, [[TMP5]]
+; VF16UF1-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP8]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF16UF1:       [[PRED_STORE_CONTINUE]]:
+; VF16UF1-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP3]], i32 1
+; VF16UF1-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF16UF1:       [[PRED_STORE_IF1]]:
+; VF16UF1-NEXT:    [[TMP10:%.*]] = mul i64 1, [[STEP]]
+; VF16UF1-NEXT:    [[TMP11:%.*]] = add i64 0, [[TMP10]]
+; VF16UF1-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP13]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF16UF1:       [[PRED_STORE_CONTINUE2]]:
+; VF16UF1-NEXT:    [[TMP14:%.*]] = extractelement <16 x i1> [[TMP3]], i32 2
+; VF16UF1-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF16UF1:       [[PRED_STORE_IF3]]:
+; VF16UF1-NEXT:    [[TMP15:%.*]] = mul i64 2, [[STEP]]
+; VF16UF1-NEXT:    [[TMP16:%.*]] = add i64 0, [[TMP15]]
+; VF16UF1-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP18]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF16UF1:       [[PRED_STORE_CONTINUE4]]:
+; VF16UF1-NEXT:    [[TMP19:%.*]] = extractelement <16 x i1> [[TMP3]], i32 3
+; VF16UF1-NEXT:    br i1 [[TMP19]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF16UF1:       [[PRED_STORE_IF5]]:
+; VF16UF1-NEXT:    [[TMP20:%.*]] = mul i64 3, [[STEP]]
+; VF16UF1-NEXT:    [[TMP21:%.*]] = add i64 0, [[TMP20]]
+; VF16UF1-NEXT:    [[TMP22:%.*]] = add i64 [[TMP21]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP22]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP23]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF16UF1:       [[PRED_STORE_CONTINUE6]]:
+; VF16UF1-NEXT:    [[TMP24:%.*]] = extractelement <16 x i1> [[TMP3]], i32 4
+; VF16UF1-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF16UF1:       [[PRED_STORE_IF7]]:
+; VF16UF1-NEXT:    [[TMP25:%.*]] = mul i64 4, [[STEP]]
+; VF16UF1-NEXT:    [[TMP26:%.*]] = add i64 0, [[TMP25]]
+; VF16UF1-NEXT:    [[TMP27:%.*]] = add i64 [[TMP26]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP27]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP28]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF16UF1:       [[PRED_STORE_CONTINUE8]]:
+; VF16UF1-NEXT:    [[TMP29:%.*]] = extractelement <16 x i1> [[TMP3]], i32 5
+; VF16UF1-NEXT:    br i1 [[TMP29]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF16UF1:       [[PRED_STORE_IF9]]:
+; VF16UF1-NEXT:    [[TMP30:%.*]] = mul i64 5, [[STEP]]
+; VF16UF1-NEXT:    [[TMP31:%.*]] = add i64 0, [[TMP30]]
+; VF16UF1-NEXT:    [[TMP32:%.*]] = add i64 [[TMP31]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP32]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP33]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF16UF1:       [[PRED_STORE_CONTINUE10]]:
+; VF16UF1-NEXT:    [[TMP34:%.*]] = extractelement <16 x i1> [[TMP3]], i32 6
+; VF16UF1-NEXT:    br i1 [[TMP34]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF16UF1:       [[PRED_STORE_IF11]]:
+; VF16UF1-NEXT:    [[TMP35:%.*]] = mul i64 6, [[STEP]]
+; VF16UF1-NEXT:    [[TMP36:%.*]] = add i64 0, [[TMP35]]
+; VF16UF1-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP38:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP37]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP38]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF16UF1:       [[PRED_STORE_CONTINUE12]]:
+; VF16UF1-NEXT:    [[TMP39:%.*]] = extractelement <16 x i1> [[TMP3]], i32 7
+; VF16UF1-NEXT:    br i1 [[TMP39]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF16UF1:       [[PRED_STORE_IF13]]:
+; VF16UF1-NEXT:    [[TMP40:%.*]] = mul i64 7, [[STEP]]
+; VF16UF1-NEXT:    [[TMP41:%.*]] = add i64 0, [[TMP40]]
+; VF16UF1-NEXT:    [[TMP42:%.*]] = add i64 [[TMP41]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP42]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP43]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF16UF1:       [[PRED_STORE_CONTINUE14]]:
+; VF16UF1-NEXT:    [[TMP44:%.*]] = extractelement <16 x i1> [[TMP3]], i32 8
+; VF16UF1-NEXT:    br i1 [[TMP44]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
+; VF16UF1:       [[PRED_STORE_IF15]]:
+; VF16UF1-NEXT:    [[TMP45:%.*]] = mul i64 8, [[STEP]]
+; VF16UF1-NEXT:    [[TMP46:%.*]] = add i64 0, [[TMP45]]
+; VF16UF1-NEXT:    [[TMP47:%.*]] = add i64 [[TMP46]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP47]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP48]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE16]]
+; VF16UF1:       [[PRED_STORE_CONTINUE16]]:
+; VF16UF1-NEXT:    [[TMP49:%.*]] = extractelement <16 x i1> [[TMP3]], i32 9
+; VF16UF1-NEXT:    br i1 [[TMP49]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; VF16UF1:       [[PRED_STORE_IF17]]:
+; VF16UF1-NEXT:    [[TMP50:%.*]] = mul i64 9, [[STEP]]
+; VF16UF1-NEXT:    [[TMP51:%.*]] = add i64 0, [[TMP50]]
+; VF16UF1-NEXT:    [[TMP52:%.*]] = add i64 [[TMP51]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP53:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP52]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP53]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE18]]
+; VF16UF1:       [[PRED_STORE_CONTINUE18]]:
+; VF16UF1-NEXT:    [[TMP54:%.*]] = extractelement <16 x i1> [[TMP3]], i32 10
+; VF16UF1-NEXT:    br i1 [[TMP54]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; VF16UF1:       [[PRED_STORE_IF19]]:
+; VF16UF1-NEXT:    [[TMP55:%.*]] = mul i64 10, [[STEP]]
+; VF16UF1-NEXT:    [[TMP56:%.*]] = add i64 0, [[TMP55]]
+; VF16UF1-NEXT:    [[TMP57:%.*]] = add i64 [[TMP56]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP58:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP57]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP58]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE20]]
+; VF16UF1:       [[PRED_STORE_CONTINUE20]]:
+; VF16UF1-NEXT:    [[TMP59:%.*]] = extractelement <16 x i1> [[TMP3]], i32 11
+; VF16UF1-NEXT:    br i1 [[TMP59]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
+; VF16UF1:       [[PRED_STORE_IF21]]:
+; VF16UF1-NEXT:    [[TMP60:%.*]] = mul i64 11, [[STEP]]
+; VF16UF1-NEXT:    [[TMP61:%.*]] = add i64 0, [[TMP60]]
+; VF16UF1-NEXT:    [[TMP62:%.*]] = add i64 [[TMP61]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP63:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP62]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP63]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE22]]
+; VF16UF1:       [[PRED_STORE_CONTINUE22]]:
+; VF16UF1-NEXT:    [[TMP64:%.*]] = extractelement <16 x i1> [[TMP3]], i32 12
+; VF16UF1-NEXT:    br i1 [[TMP64]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; VF16UF1:       [[PRED_STORE_IF23]]:
+; VF16UF1-NEXT:    [[TMP65:%.*]] = mul i64 12, [[STEP]]
+; VF16UF1-NEXT:    [[TMP66:%.*]] = add i64 0, [[TMP65]]
+; VF16UF1-NEXT:    [[TMP67:%.*]] = add i64 [[TMP66]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP68:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP67]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP68]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE24]]
+; VF16UF1:       [[PRED_STORE_CONTINUE24]]:
+; VF16UF1-NEXT:    [[TMP69:%.*]] = extractelement <16 x i1> [[TMP3]], i32 13
+; VF16UF1-NEXT:    br i1 [[TMP69]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; VF16UF1:       [[PRED_STORE_IF25]]:
+; VF16UF1-NEXT:    [[TMP70:%.*]] = mul i64 13, [[STEP]]
+; VF16UF1-NEXT:    [[TMP71:%.*]] = add i64 0, [[TMP70]]
+; VF16UF1-NEXT:    [[TMP72:%.*]] = add i64 [[TMP71]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP73:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP72]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP73]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE26]]
+; VF16UF1:       [[PRED_STORE_CONTINUE26]]:
+; VF16UF1-NEXT:    [[TMP74:%.*]] = extractelement <16 x i1> [[TMP3]], i32 14
+; VF16UF1-NEXT:    br i1 [[TMP74]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; VF16UF1:       [[PRED_STORE_IF27]]:
+; VF16UF1-NEXT:    [[TMP75:%.*]] = mul i64 14, [[STEP]]
+; VF16UF1-NEXT:    [[TMP76:%.*]] = add i64 0, [[TMP75]]
+; VF16UF1-NEXT:    [[TMP77:%.*]] = add i64 [[TMP76]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP78:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP77]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP78]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE28]]
+; VF16UF1:       [[PRED_STORE_CONTINUE28]]:
+; VF16UF1-NEXT:    [[TMP79:%.*]] = extractelement <16 x i1> [[TMP3]], i32 15
+; VF16UF1-NEXT:    br i1 [[TMP79]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; VF16UF1:       [[PRED_STORE_IF29]]:
+; VF16UF1-NEXT:    [[TMP80:%.*]] = mul i64 15, [[STEP]]
+; VF16UF1-NEXT:    [[TMP81:%.*]] = add i64 0, [[TMP80]]
+; VF16UF1-NEXT:    [[TMP82:%.*]] = add i64 [[TMP81]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP83:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP82]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP83]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE30]]
+; VF16UF1:       [[PRED_STORE_CONTINUE30]]:
+; VF16UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF16UF1:       [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1:       [[SCALAR_PH]]:
+; VF16UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF16UF1-NEXT:    br label %[[LOOP:.*]]
+; VF16UF1:       [[LOOP]]:
+; VF16UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], [[STEP]]
+; VF16UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]]
+; VF16UF1-NEXT:    store i8 0, ptr [[GEP_DST]], align 1
+; VF16UF1-NEXT:    [[EC:%.*]] = icmp slt i64 [[IV_NEXT]], 16
+; VF16UF1-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF16UF1:       [[EXIT]]:
+; VF16UF1-NEXT:    ret void
+;
+entry:
+  %c = icmp eq i64 %x, 65536
+  call void @llvm.assume(i1 %c)
+  %fr = freeze i64 %x
+  %step = add i64 %fr, -65534
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.next = add i64 %iv, %step
+  %gep.dst = getelementptr i8, ptr %dst, i64 %iv.next
+  store i8 0, ptr %gep.dst, align 1
+  %ec = icmp slt i64 %iv.next, 16
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+;.
+; VF8UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF8UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF8UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF8UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF8UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
+; VF8UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+; VF8UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]}
+; VF8UF1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+;.
+; VF8UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF8UF2: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF8UF2: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; VF8UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF8UF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+;.
+; VF16UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF16UF1: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF16UF1: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; VF16UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF16UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
index 930d3cd..791c995 100644
--- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
@@ -499,16 +499,13 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], [[G_64]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], [[G_64]]
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 0, [[G_64]]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 0, [[G_64]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 0, [[TMP8]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 -3
 ; CHECK-NEXT:    store <4 x i16> splat (i16 -1), ptr [[TMP7]], align 2
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -521,7 +518,7 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
 ; CHECK-NEXT:    store i16 [[G_16]], ptr [[GEP]], align 2
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], [[G_64]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 16
-; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -558,6 +555,5 @@ exit:
 ; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]}
 ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]}
-; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
-; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
index 95855e8..be1eb78 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
@@ -35,10 +35,11 @@ define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) {
 ; CHECK-NEXT:  Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop.header>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next.p, %loop.latch ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next.p, %loop.latch ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:         IR   %iv.next = add i64 %iv, 1
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
index 484e1ea..dd3b50b 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
@@ -14,7 +14,7 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
 ; CHECK-NEXT: ir-bb<vector.ph>:
 ; CHECK-NEXT:   IR %n.mod.vf = urem i64 %0, 2
 ; CHECK-NEXT:   IR %n.vec = sub i64 %0, %n.mod.vf
-; CHECK-NEXT:   IR %ind.end = getelementptr i8, ptr %start, i64 %n.vec
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<%start> + ir<%n.vec> * ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -94,7 +94,7 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<scalar.ph>:
-; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi ir<%ind.end>, ir<%start>
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[END]]>, ir<%start>
 ; CHECK-NEXT: Successor(s): ir-bb<loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop.header>:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
index f07d1af..beb305f 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
@@ -19,6 +19,8 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END1:%.+]]> = DERIVED-IV ir<%and> + vp<[[VTC]]> * ir<-1>
+; CHECK-NEXT:   vp<[[END2:%.+]]> = DERIVED-IV ir<%A> + vp<[[VTC]]> * ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -43,11 +45,13 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%and>
+; CHECK-NEXT:   EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%A>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %and, %entry ], [ %iv.next, %loop ]
-; CHECK-NEXT:   IR   %p.src = phi ptr [ %A, %entry ], [ %p.src.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %and, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
+; CHECK-NEXT:   IR   %p.src = phi ptr [ %A, %entry ], [ %p.src.next, %loop ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; CHECK:        IR   %cmp = icmp eq i64 %iv.next, 0
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -57,7 +61,6 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ;
 ; CHECK: Executing best plan with VF=8, UF=2
 ; CHECK-NEXT: VPlan 'Final VPlan for VF={8},UF={2}' {
-; CHECK-NEXT: Live-in ir<[[VFxUF:.+]]> = VF * UF
 ; CHECK-NEXT: Live-in ir<[[VTC:%.+]]> = vector-trip-count
 ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
 ; CHECK-EMPTY:
@@ -69,29 +72,23 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: ir-bb<vector.ph>:
 ; CHECK-NEXT:  IR   %n.mod.vf = urem i64 %and, 16
 ; CHECK-NEXT:  IR   %n.vec = sub i64 %and, %n.mod.vf
-; CHECK-NEXT:  IR   %ind.end = sub i64 %and, %n.vec
-; CHECK-NEXT:  IR   %ind.end1 = getelementptr i8, ptr %A, i64 %n.vec
-; CHECK-NEXT: Successor(s): vector loop
+; CHECK-NEXT:  vp<[[END1:%.+]]> = DERIVED-IV ir<%and> + ir<[[VTC]]> * ir<-1>
+; CHECK-NEXT:  vp<[[END2:%.+]]> = DERIVED-IV ir<%A> + ir<[[VTC]]> * ir<1>
+; CHECK-NEXT: Successor(s): vector.body
 ; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT:   vector.body:
-; CHECK-NEXT:     SCALAR-PHI vp<[[CAN_IV:%.+]]> = phi ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT:     vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT:     EMIT vp<[[PADD1:%.+]]> = ptradd ir<%A>, vp<[[STEPS1]]>
-; CHECK-NEXT:     vp<[[VPTR1:%.]]> = vector-pointer vp<[[PADD1]]>
-; CHECK-NEXT:     vp<[[VPTR2:%.]]> = vector-pointer vp<[[PADD1]]>, ir<1>
-; CHECK-NEXT:     WIDEN ir<%l> = load vp<[[VPTR1]]>
-; CHECK-NEXT:     WIDEN ir<%l>.1 = load vp<[[VPTR2]]>
-; CHECK-NEXT:     WIDEN ir<%add> = add nsw ir<%l>, ir<10>
-; CHECK-NEXT:     WIDEN ir<%add>.1 = add nsw ir<%l>.1, ir<10>
-; CHECK-NEXT:     vp<[[VPTR3:%.+]]> = vector-pointer vp<[[PADD1]]>
-; CHECK-NEXT:     vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>, ir<1>
-; CHECK-NEXT:     WIDEN store vp<[[VPTR3]]>, ir<%add>
-; CHECK-NEXT:     WIDEN store vp<[[VPTR4]]>, ir<%add>.1
-; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV:%.+]]>, ir<[[VFxUF]]>
-; CHECK-NEXT:     EMIT branch-on-cond ir<true>
-; CHECK-NEXT:   No successors
-; CHECK-NEXT: }
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT:   vp<[[STEPS1:%.+]]> = SCALAR-STEPS ir<0>, ir<1>
+; CHECK-NEXT:   EMIT vp<[[PADD1:%.+]]> = ptradd ir<%A>, vp<[[STEPS1]]>
+; CHECK-NEXT:   vp<[[VPTR1:%.]]> = vector-pointer vp<[[PADD1]]>
+; CHECK-NEXT:   vp<[[VPTR2:%.]]> = vector-pointer vp<[[PADD1]]>, ir<1>
+; CHECK-NEXT:   WIDEN ir<%l> = load vp<[[VPTR1]]>
+; CHECK-NEXT:   WIDEN ir<%l>.1 = load vp<[[VPTR2]]>
+; CHECK-NEXT:   WIDEN ir<%add> = add nsw ir<%l>, ir<10>
+; CHECK-NEXT:   WIDEN ir<%add>.1 = add nsw ir<%l>.1, ir<10>
+; CHECK-NEXT:   vp<[[VPTR3:%.+]]> = vector-pointer vp<[[PADD1]]>
+; CHECK-NEXT:   vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>, ir<1>
+; CHECK-NEXT:   WIDEN store vp<[[VPTR3]]>, ir<%add>
+; CHECK-NEXT:   WIDEN store vp<[[VPTR4]]>, ir<%add>.1
 ; CHECK-NEXT: Successor(s): ir-bb<middle.block>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<middle.block>:
@@ -103,8 +100,8 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<scalar.ph>:
-; CHECK-NEXT:   EMIT vp<[[RESUME1:%.+]]> = resume-phi ir<%ind.end>, ir<%and>
-; CHECK-NEXT:   EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi ir<%ind.end1>, ir<%A>
+; CHECK-NEXT:   EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%and>
+; CHECK-NEXT:   EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%A>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index cc2bd4e..5c09ce2 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -42,10 +42,11 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw
 ; CHECK-NEXT:  Successor(s): ir-bb<for.end.loopexit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph
+; CHECK-NEXT:    EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -112,10 +113,11 @@ define void @print_widen_gep_and_select(i64 %n, ptr noalias %y, ptr noalias %x,
 ; CHECK-NEXT:  Successor(s): ir-bb<for.end.loopexit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph
+; CHECK-NEXT:    EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -180,11 +182,12 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -246,11 +249,12 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %red = phi float [ %red.next, %for.body ], [ 0.000000e+00, %entry ]
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT: No successors
@@ -332,10 +336,11 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) {
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+; CHECK-NEXT:    IR   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %cmp = icmp ult i64 %i, 5
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -381,6 +386,7 @@ define void @print_interleave_groups(i32 %C, i32 %D) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[IV_END:%.+]]> = DERIVED-IV ir<0> + vp<[[VTC]]> * ir<4>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -412,10 +418,11 @@ define void @print_interleave_groups(i32 %C, i32 %D) {
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[IV_END]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %cmp = icmp slt i64 %iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -494,12 +501,13 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
-; CHECK-NEXT:    IR   %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
+; CHECK-NEXT:    IR   %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ] (extra operand: vp<[[RED_RESUME]]> from scalar.ph)
 ; CHECK:         IR   %exitcond.not = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -588,10 +596,11 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %if.end ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %if.end ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %cmp1 = icmp slt i32 %lsd, 100
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -648,6 +657,7 @@ define void @print_expand_scev(i64 %y, ptr %ptr) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[IV_END:%.+]]> = DERIVED-IV ir<0> + vp<[[VTC]]> * vp<[[EXP_SCEV]]>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -671,10 +681,11 @@ define void @print_expand_scev(i64 %y, ptr %ptr) {
 ; CHECK-NEXT: Successor(s): ir-bb<loop.exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[IV_END]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %iv.next = add i64 %iv, %inc
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -738,10 +749,11 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %ec = icmp eq i32 %iv.next, 1000
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -805,6 +817,7 @@ define void @print_fast_math_flags(i64 %n, ptr noalias %y, ptr noalias %x, ptr %
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
@@ -873,10 +886,11 @@ define void @print_exact_flags(i64 %n, ptr noalias %x) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -961,10 +975,11 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) {
 ; CHECK-NEXT: Successor(s): ir-bb<end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %ifcond = fcmp oeq float %ld.value, 5.0
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -1038,10 +1053,11 @@ define void @print_disjoint_flags(i64 %n, ptr noalias %x) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -1152,11 +1168,12 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
 ; CHECK-NEXT:    IR   %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_P]]> from scalar.ph)
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond.not = icmp eq i64 %iv.next, 1000
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll
index a939b1e..b6391e0 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll
@@ -16,6 +16,7 @@ define void @sink_with_sideeffects(i1 %c, ptr %ptr) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<0> + vp<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -53,11 +54,13 @@ define void @sink_with_sideeffects(i1 %c, ptr %ptr) {
 ; CHECK-NEXT:  Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
+; CHECK-NEXT:    EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ]
-; CHECK-NEXT:    IR   %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 0, %entry ]
+; CHECK-NEXT:    IR   %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
+; CHECK-NEXT:    IR   %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 0, %entry ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; CHECK:         IR   %tmp5 = trunc i32 %tmp4 to i8
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
index 5343451..aa05bb1 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -262,6 +262,7 @@ define void @uniform_gep(i64 %k, ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<21> + vp<[[VEC_TC]]> * ir<1>
 ; CHECK-NEXT:   CLONE ir<%gep.A.uniform> = getelementptr inbounds ir<%A>, ir<0>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
@@ -1046,6 +1047,7 @@ define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr n
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -1086,10 +1088,11 @@ define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr n
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[END]]>, ir<%n>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i32 [ %n, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ %n, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %iv.next = add nsw i32 %iv, -1
 ; CHECK-NEXT:   IR   %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv
 ; CHECK-NEXT:   IR   %l = load i32, ptr %gep.src, align 16
@@ -1134,6 +1137,7 @@ define void @ptr_induction_remove_dead_recipe(ptr %start, ptr %end) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<%start> + vp<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -1177,10 +1181,11 @@ define void @ptr_induction_remove_dead_recipe(ptr %start, ptr %end) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[END]]>, ir<%start>
 ; CHECK-NEXT: Successor(s): ir-bb<loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT:   IR   %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ]
+; CHECK-NEXT:   IR   %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %ptr.iv.next = getelementptr inbounds i8, ptr %ptr.iv, i64 -1
 ; CHECK-NEXT:   IR   %l = load i8, ptr %ptr.iv.next, align 1
 ; CHECK-NEXT:   IR   %c.1 = icmp eq i8 %l, 0
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
index 39b90ad..65d78f4 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
@@ -803,6 +803,19 @@ define void @byval_param_noalias_metadata(ptr align 4 byval(i32) %ptr) {
   ret void
 }
 
+define void @byval_param_profile_metadata(ptr align 4 byval(i32) %ptr) {
+; CHECK-LABEL: @byval_param_profile_metadata(
+; CHECK-NEXT:    store i32 1, ptr [[PTR2:%.*]], align 4
+; CHECK-NEXT:    call void @f_byval(ptr byval(i32) align 4 [[PTR2]]), !prof [[PROF3:![0-9]+]], !memprof [[META4:![0-9]+]], !callsite [[META7:![0-9]+]]
+; CHECK-NEXT:    ret void
+;
+  %tmp = alloca i32, align 4
+  store i32 1, ptr %ptr
+  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %tmp, ptr align 4 %ptr, i64 4, i1 false)
+  call void @f_byval(ptr align 4 byval(i32) %tmp), !memprof !3, !callsite !6, !prof !7
+  ret void
+}
+
 define void @memcpy_memory_none(ptr %p, ptr %p2, i64 %size) {
 ; CHECK-LABEL: @memcpy_memory_none(
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[P:%.*]], ptr [[P2:%.*]], i64 [[SIZE:%.*]], i1 false) #[[ATTR7:[0-9]+]]
@@ -897,3 +910,8 @@ define void @memcpy_immut_escape_after(ptr align 4 noalias %val) {
 !0 = !{!0}
 !1 = !{!1, !0}
 !2 = !{!1}
+!3 = !{!4}
+!4 = !{!5, !"cold"}
+!5 = !{i64 123, i64 456}
+!6 = !{i64 123}
+!7 = !{!"branch_weights", i32 10}
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll b/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll
new file mode 100644
index 0000000..759d511
--- /dev/null
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll
@@ -0,0 +1,159 @@
+;; Test recursion handling during cloning.
+;;
+;; Original code looks like:
+;;
+;; #include <stdlib.h>
+;; #include <string.h>
+;; #include <unistd.h>
+;; __attribute((noinline)) char *D() {
+;;   return new char[10];
+;; }
+;; __attribute((noinline)) char *B(int n);
+;; __attribute((noinline)) char *C(int n) {
+;;   if (!n) {
+;;     return D();
+;;   }
+;;   return B(n-1);
+;; }
+;; __attribute((noinline)) char *B(int n) {
+;;   return C(n);
+;; }
+;; int main(int argc, char **argv) {
+;;   char *x = B(1);
+;;   char *y = B(1);
+;;   char *z = B(0);
+;;   memset(x, 0, 10);
+;;   memset(y, 0, 10);
+;;   memset(z, 0, 10);
+;;   free(x);
+;;   sleep(200);
+;;   free(y);
+;;   free(z);
+;;   return 0;
+;; }
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+;; By default we should enable cloning of contexts involved with recursive
+;; cycles, but not through the cycle itself. I.e. until full support for
+;; recursion is added, the cloned recursive call from C back to B (line 12) will
+;; not be updated to call a clone.
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  %s -S 2>&1 | FileCheck %s \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS
+
+;; Skipping recursive callsites should result in no cloning.
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:	-memprof-allow-recursive-callsites=false \
+; RUN:  %s -S 2>&1 | FileCheck %s \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --implicit-check-not="created clone" \
+; RUN:	--implicit-check-not="marked with memprof allocation attribute cold" \
+; RUN:  --check-prefix=ALL
+
+;; Skipping recursive contexts should prevent spurious call to cloned version of
+;; B from the context starting at memprof_recursive.cc:19:13, which is actually
+;; recursive (until that support is added).
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:	-memprof-allow-recursive-contexts=false \
+; RUN:  %s -S 2>&1 | FileCheck %s \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=SKIP-RECUR-CONTEXTS
+
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:4:0: created clone _Z1Dv.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:8:0: created clone _Z1Ci.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:14:0: created clone _Z1Bi.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:20:13: call in clone main assigned to call function clone _Z1Bi.memprof.1
+;; We should only call the cold clone for the recursive context if we enabled
+;; recursive contexts via -memprof-allow-recursive-contexts=true (default).
+; ALLOW-RECUR-CONTEXTS: memprof_recursive.cc:19:13: call in clone main assigned to call function clone _Z1Bi.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:15:10: call in clone _Z1Bi.memprof.1 assigned to call function clone _Z1Ci.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:10:12: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Dv.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:5:10: call in clone _Z1Dv.memprof.1 marked with memprof allocation attribute cold
+;; We should call the original B for the recursive context if we have
+;; disabled recursive contexts via -memprof-allow-recursive-contexts=false.
+; SKIP-RECUR-CONTEXTS: memprof_recursive.cc:19:13: call in clone main assigned to call function clone _Z1Bi
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:12:10: call in clone _Z1Ci assigned to call function clone _Z1Bi
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:18:13: call in clone main assigned to call function clone _Z1Bi
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:15:10: call in clone _Z1Bi assigned to call function clone _Z1Ci
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:10:12: call in clone _Z1Ci assigned to call function clone _Z1Dv
+; ALL: memprof_recursive.cc:5:10: call in clone _Z1Dv marked with memprof allocation attribute notcold
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define ptr @_Z1Dv() !dbg !3 {
+entry:
+  %call = tail call ptr @_Znam(i64 10), !dbg !6, !memprof !7, !callsite !14
+  ret ptr null
+}
+
+define ptr @_Z1Ci(i32 %n) !dbg !15 {
+entry:
+  %call = tail call ptr @_Z1Dv(), !dbg !16, !callsite !17
+  br label %return
+
+if.end:                                           ; No predecessors!
+  %call1 = tail call ptr @_Z1Bi(i32 0), !dbg !18, !callsite !19
+  br label %return
+
+return:                                           ; preds = %if.end, %entry
+  ret ptr null
+}
+
+define ptr @_Z1Bi(i32 %n) !dbg !20 {
+entry:
+  %call = tail call ptr @_Z1Ci(i32 0), !dbg !21, !callsite !22
+  ret ptr null
+}
+
+define i32 @main() {
+entry:
+  %call = tail call ptr @_Z1Bi(i32 0), !dbg !23, !callsite !25
+  %call1 = tail call ptr @_Z1Bi(i32 0), !dbg !26, !callsite !27
+  %call2 = tail call ptr @_Z1Bi(i32 0), !dbg !28, !callsite !29
+  ret i32 0
+}
+
+declare ptr @_Znam(i64)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 20.0.0git (https://github.com/llvm/llvm-project.git 7aec6dc477f8148ed066d10dfc7a012a51b6599c)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "memprof_recursive.cc", directory: ".", checksumkind: CSK_MD5, checksum: "2f15f63b187a0e0d40e7fdd18b10576a")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "D", linkageName: "_Z1Dv", scope: !1, file: !1, line: 4, type: !4, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!4 = !DISubroutineType(types: !5)
+!5 = !{}
+!6 = !DILocation(line: 5, column: 10, scope: !3)
+!7 = !{!8, !10, !12}
+!8 = !{!9, !"cold"}
+!9 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 6307901912192269588}
+!10 = !{!11, !"notcold"}
+!11 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 -7155190423157709404, i64 -2954124005641725917, i64 8632435727821051414}
+!12 = !{!13, !"cold"}
+!13 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 -7155190423157709404, i64 -2954124005641725917, i64 -3421689549917153178}
+!14 = !{i64 6541423618768552252}
+!15 = distinct !DISubprogram(name: "C", linkageName: "_Z1Ci", scope: !1, file: !1, line: 8, type: !4, scopeLine: 8, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!16 = !DILocation(line: 10, column: 12, scope: !15)
+!17 = !{i64 -200552803509692312}
+!18 = !DILocation(line: 12, column: 10, scope: !15)
+!19 = !{i64 -7155190423157709404}
+!20 = distinct !DISubprogram(name: "B", linkageName: "_Z1Bi", scope: !1, file: !1, line: 14, type: !4, scopeLine: 14, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!21 = !DILocation(line: 15, column: 10, scope: !20)
+!22 = !{i64 -2954124005641725917}
+!23 = !DILocation(line: 18, column: 13, scope: !24)
+!24 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 17, type: !4, scopeLine: 17, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!25 = !{i64 8632435727821051414}
+!26 = !DILocation(line: 19, column: 13, scope: !24)
+!27 = !{i64 -3421689549917153178}
+!28 = !DILocation(line: 20, column: 13, scope: !24)
+!29 = !{i64 6307901912192269588}
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll
new file mode 100644
index 0000000..9f3e09d
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll
@@ -0,0 +1,806 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64"
+
+%struct.cmplx_int16_t = type { i16, i16 }
+%struct.compressed_data_8bit = type { i8, [24 x i8] }
+
+define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_(i32 noundef %n_prb, ptr noundef %src, ptr noundef %dst, ptr noundef %scale) #0 {
+; CHECK-LABEL: define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_(
+; CHECK-SAME: i32 noundef [[N_PRB:%.*]], ptr nocapture noundef readonly [[SRC:%.*]], ptr nocapture noundef writeonly [[DST:%.*]], ptr noundef readonly [[SCALE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP47_NOT:%.*]] = icmp eq i32 [[N_PRB]], 0
+; CHECK-NEXT:    br i1 [[CMP47_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY_LR_PH:.*]]
+; CHECK:       [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT:    [[CMP31_NOT:%.*]] = icmp eq ptr [[SCALE]], null
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT58:%.*]] = zext i32 [[N_PRB]] to i64
+; CHECK-NEXT:    br i1 [[CMP31_NOT]], label %[[FOR_BODY_US:.*]], label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY_US]]:
+; CHECK-NEXT:    [[INDVARS_IV55:%.*]] = phi i64 [ [[INDVARS_IV_NEXT56:%.*]], %[[FOR_BODY_US]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[DST_ADDR_052_US:%.*]] = phi ptr [ [[DST_ADDR_1_US:%.*]], %[[FOR_BODY_US]] ], [ [[DST]], %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds nuw [[STRUCT_COMPRESSED_DATA_8BIT:%.*]], ptr [[SRC]], i64 [[INDVARS_IV55]]
+; CHECK-NEXT:    [[MANTISSA_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[MANTISSA_US]], align 1
+; CHECK-NEXT:    [[VMOVL_I59_US:%.*]] = sext <8 x i8> [[TMP0]] to <8 x i16>
+; CHECK-NEXT:    [[ARRAYIDX7_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 9
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX7_US]], align 1
+; CHECK-NEXT:    [[VMOVL_I56_US:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i16>
+; CHECK-NEXT:    [[ARRAYIDX15_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 17
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX15_US]], align 1
+; CHECK-NEXT:    [[VMOVL_I_US:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX_US]], align 1
+; CHECK-NEXT:    [[CONV_US:%.*]] = sext i8 [[TMP3]] to i16
+; CHECK-NEXT:    [[MUL_US:%.*]] = shl nsw i16 [[CONV_US]], 1
+; CHECK-NEXT:    [[VECINIT_I79_US:%.*]] = insertelement <8 x i16> poison, i16 [[MUL_US]], i64 0
+; CHECK-NEXT:    [[VECINIT7_I86_US:%.*]] = shufflevector <8 x i16> [[VECINIT_I79_US]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[MUL_I87_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I59_US]]
+; CHECK-NEXT:    [[MUL_I74_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I56_US]]
+; CHECK-NEXT:    [[MUL_I_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I_US]]
+; CHECK-NEXT:    store <8 x i16> [[MUL_I87_US]], ptr [[DST_ADDR_052_US]], align 2
+; CHECK-NEXT:    [[ADD_PTR47_US:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 16
+; CHECK-NEXT:    store <8 x i16> [[MUL_I74_US]], ptr [[ADD_PTR47_US]], align 2
+; CHECK-NEXT:    [[ADD_PTR50_US:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 32
+; CHECK-NEXT:    store <8 x i16> [[MUL_I_US]], ptr [[ADD_PTR50_US]], align 2
+; CHECK-NEXT:    [[DST_ADDR_1_US]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 48
+; CHECK-NEXT:    [[INDVARS_IV_NEXT56]] = add nuw nsw i64 [[INDVARS_IV55]], 1
+; CHECK-NEXT:    [[EXITCOND59_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT56]], [[WIDE_TRIP_COUNT58]]
+; CHECK-NEXT:    br i1 [[EXITCOND59_NOT]], label %[[FOR_END]], label %[[FOR_BODY_US]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[DST_ADDR_052:%.*]] = phi ptr [ [[DST_ADDR_1:%.*]], %[[FOR_BODY]] ], [ [[DST]], %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[AGG_TMP_COERCE_050:%.*]] = phi i64 [ [[AGG_TMP_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[AGG_TMP42_COERCE_049:%.*]] = phi i64 [ [[AGG_TMP42_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[AGG_TMP37_COERCE_048:%.*]] = phi i64 [ [[AGG_TMP37_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_COMPRESSED_DATA_8BIT]], ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[MANTISSA:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr [[MANTISSA]], align 1
+; CHECK-NEXT:    [[VMOVL_I59:%.*]] = sext <8 x i8> [[TMP4]] to <8 x i16>
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 9
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT:    [[VMOVL_I56:%.*]] = sext <8 x i8> [[TMP5]] to <8 x i16>
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 17
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX15]], align 1
+; CHECK-NEXT:    [[VMOVL_I:%.*]] = sext <8 x i8> [[TMP6]] to <8 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[TMP7]] to i16
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i16 [[CONV]], 1
+; CHECK-NEXT:    [[VECINIT_I79:%.*]] = insertelement <8 x i16> poison, i16 [[MUL]], i64 0
+; CHECK-NEXT:    [[VECINIT7_I86:%.*]] = shufflevector <8 x i16> [[VECINIT_I79]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[MUL_I87:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I59]]
+; CHECK-NEXT:    [[MUL_I74:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I56]]
+; CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I]]
+; CHECK-NEXT:    [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
+; CHECK-NEXT:    [[AGG_TMP_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP_SROA_0_0_COPYLOAD]] to i64
+; CHECK-NEXT:    [[AGG_TMP_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP_COERCE_050]], -4294967296
+; CHECK-NEXT:    [[AGG_TMP_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP_COERCE_0_INSERT_MASK]], [[AGG_TMP_COERCE_0_INSERT_EXT]]
+; CHECK-NEXT:    [[CALL33:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I87]], i64 [[AGG_TMP_COERCE_0_INSERT_INSERT]])
+; CHECK-NEXT:    store <8 x i16> [[CALL33]], ptr [[DST_ADDR_052]], align 2
+; CHECK-NEXT:    [[AGG_TMP37_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
+; CHECK-NEXT:    [[AGG_TMP37_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP37_SROA_0_0_COPYLOAD]] to i64
+; CHECK-NEXT:    [[AGG_TMP37_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP37_COERCE_048]], -4294967296
+; CHECK-NEXT:    [[AGG_TMP37_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP37_COERCE_0_INSERT_MASK]], [[AGG_TMP37_COERCE_0_INSERT_EXT]]
+; CHECK-NEXT:    [[CALL38:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I74]], i64 [[AGG_TMP37_COERCE_0_INSERT_INSERT]])
+; CHECK-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 16
+; CHECK-NEXT:    store <8 x i16> [[CALL38]], ptr [[ARRAYIDX39]], align 2
+; CHECK-NEXT:    [[AGG_TMP42_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
+; CHECK-NEXT:    [[AGG_TMP42_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP42_SROA_0_0_COPYLOAD]] to i64
+; CHECK-NEXT:    [[AGG_TMP42_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP42_COERCE_049]], -4294967296
+; CHECK-NEXT:    [[AGG_TMP42_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP42_COERCE_0_INSERT_MASK]], [[AGG_TMP42_COERCE_0_INSERT_EXT]]
+; CHECK-NEXT:    [[CALL43:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I]], i64 [[AGG_TMP42_COERCE_0_INSERT_INSERT]])
+; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 32
+; CHECK-NEXT:    store <8 x i16> [[CALL43]], ptr [[ARRAYIDX44]], align 2
+; CHECK-NEXT:    [[DST_ADDR_1]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 48
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT58]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %__p0.addr.i75 = alloca <8 x i16>, align 16
+  %__p1.addr.i76 = alloca i16, align 2
+  %__ret.i77 = alloca <8 x i16>, align 16
+  %.compoundliteral.i78 = alloca <8 x i16>, align 16
+  %__p0.addr.i62 = alloca <8 x i16>, align 16
+  %__p1.addr.i63 = alloca i16, align 2
+  %__ret.i64 = alloca <8 x i16>, align 16
+  %.compoundliteral.i65 = alloca <8 x i16>, align 16
+  %__p0.addr.i60 = alloca <8 x i16>, align 16
+  %__p1.addr.i = alloca i16, align 2
+  %__ret.i61 = alloca <8 x i16>, align 16
+  %.compoundliteral.i = alloca <8 x i16>, align 16
+  %__p0.addr.i57 = alloca <8 x i8>, align 8
+  %__ret.i58 = alloca <8 x i16>, align 16
+  %__p0.addr.i54 = alloca <8 x i8>, align 8
+  %__ret.i55 = alloca <8 x i16>, align 16
+  %__p0.addr.i = alloca <8 x i8>, align 8
+  %__ret.i = alloca <8 x i16>, align 16
+  %n_prb.addr = alloca i32, align 4
+  %src.addr = alloca ptr, align 8
+  %dst.addr = alloca ptr, align 8
+  %scale.addr = alloca ptr, align 8
+  %i = alloca i32, align 4
+  %prb_comp_in = alloca [3 x <8 x i16>], align 16
+  %__ret = alloca <8 x i8>, align 8
+  %tmp = alloca <8 x i8>, align 8
+  %__ret3 = alloca <8 x i8>, align 8
+  %tmp8 = alloca <8 x i8>, align 8
+  %__ret11 = alloca <8 x i8>, align 8
+  %tmp16 = alloca <8 x i8>, align 8
+  %prb_decomp = alloca [3 x <8 x i16>], align 16
+  %scaling_factor = alloca i16, align 2
+  %__s1 = alloca <8 x i16>, align 16
+  %agg.tmp = alloca %struct.cmplx_int16_t, align 2
+  %agg.tmp.coerce = alloca i64, align 8
+  %__s135 = alloca <8 x i16>, align 16
+  %agg.tmp37 = alloca %struct.cmplx_int16_t, align 2
+  %agg.tmp37.coerce = alloca i64, align 8
+  %__s140 = alloca <8 x i16>, align 16
+  %agg.tmp42 = alloca %struct.cmplx_int16_t, align 2
+  %agg.tmp42.coerce = alloca i64, align 8
+  %__s145 = alloca <8 x i16>, align 16
+  %__s148 = alloca <8 x i16>, align 16
+  %__s151 = alloca <8 x i16>, align 16
+  store i32 %n_prb, ptr %n_prb.addr, align 4
+  store ptr %src, ptr %src.addr, align 8
+  store ptr %dst, ptr %dst.addr, align 8
+  store ptr %scale, ptr %scale.addr, align 8
+  store i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, ptr %i, align 4
+  %1 = load i32, ptr %n_prb.addr, align 4
+  %cmp = icmp ult i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load ptr, ptr %src.addr, align 8
+  %3 = load i32, ptr %i, align 4
+  %idxprom = zext i32 %3 to i64
+  %arrayidx = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %2, i64 %idxprom
+  %mantissa = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx, i32 0, i32 1
+  %arrayidx1 = getelementptr inbounds [24 x i8], ptr %mantissa, i64 0, i64 0
+  %4 = load <8 x i8>, ptr %arrayidx1, align 1
+  store <8 x i8> %4, ptr %__ret, align 8
+  %5 = load <8 x i8>, ptr %__ret, align 8
+  store <8 x i8> %5, ptr %tmp, align 8
+  %6 = load <8 x i8>, ptr %tmp, align 8
+  store <8 x i8> %6, ptr %__p0.addr.i57, align 8
+  %7 = load <8 x i8>, ptr %__p0.addr.i57, align 8
+  %vmovl.i59 = sext <8 x i8> %7 to <8 x i16>
+  store <8 x i16> %vmovl.i59, ptr %__ret.i58, align 16
+  %8 = load <8 x i16>, ptr %__ret.i58, align 16
+  %arrayidx2 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 0
+  store <8 x i16> %8, ptr %arrayidx2, align 16
+  %9 = load ptr, ptr %src.addr, align 8
+  %10 = load i32, ptr %i, align 4
+  %idxprom4 = zext i32 %10 to i64
+  %arrayidx5 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %9, i64 %idxprom4
+  %mantissa6 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx5, i32 0, i32 1
+  %arrayidx7 = getelementptr inbounds [24 x i8], ptr %mantissa6, i64 0, i64 8
+  %11 = load <8 x i8>, ptr %arrayidx7, align 1
+  store <8 x i8> %11, ptr %__ret3, align 8
+  %12 = load <8 x i8>, ptr %__ret3, align 8
+  store <8 x i8> %12, ptr %tmp8, align 8
+  %13 = load <8 x i8>, ptr %tmp8, align 8
+  store <8 x i8> %13, ptr %__p0.addr.i54, align 8
+  %14 = load <8 x i8>, ptr %__p0.addr.i54, align 8
+  %vmovl.i56 = sext <8 x i8> %14 to <8 x i16>
+  store <8 x i16> %vmovl.i56, ptr %__ret.i55, align 16
+  %15 = load <8 x i16>, ptr %__ret.i55, align 16
+  %arrayidx10 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 1
+  store <8 x i16> %15, ptr %arrayidx10, align 16
+  %16 = load ptr, ptr %src.addr, align 8
+  %17 = load i32, ptr %i, align 4
+  %idxprom12 = zext i32 %17 to i64
+  %arrayidx13 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %16, i64 %idxprom12
+  %mantissa14 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx13, i32 0, i32 1
+  %arrayidx15 = getelementptr inbounds [24 x i8], ptr %mantissa14, i64 0, i64 16
+  %18 = load <8 x i8>, ptr %arrayidx15, align 1
+  store <8 x i8> %18, ptr %__ret11, align 8
+  %19 = load <8 x i8>, ptr %__ret11, align 8
+  store <8 x i8> %19, ptr %tmp16, align 8
+  %20 = load <8 x i8>, ptr %tmp16, align 8
+  store <8 x i8> %20, ptr %__p0.addr.i, align 8
+  %21 = load <8 x i8>, ptr %__p0.addr.i, align 8
+  %vmovl.i = sext <8 x i8> %21 to <8 x i16>
+  store <8 x i16> %vmovl.i, ptr %__ret.i, align 16
+  %22 = load <8 x i16>, ptr %__ret.i, align 16
+  %arrayidx18 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 2
+  store <8 x i16> %22, ptr %arrayidx18, align 16
+  %23 = load ptr, ptr %src.addr, align 8
+  %24 = load i32, ptr %i, align 4
+  %idxprom19 = zext i32 %24 to i64
+  %arrayidx20 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %23, i64 %idxprom19
+  %exp = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx20, i32 0, i32 0
+  %25 = load i8, ptr %exp, align 1
+  %conv = sext i8 %25 to i32
+  %mul = mul nsw i32 %conv, 2
+  %conv21 = trunc i32 %mul to i16
+  store i16 %conv21, ptr %scaling_factor, align 2
+  %arrayidx22 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 0
+  %26 = load <8 x i16>, ptr %arrayidx22, align 16
+  %27 = load i16, ptr %scaling_factor, align 2
+  store <8 x i16> %26, ptr %__p0.addr.i75, align 16
+  store i16 %27, ptr %__p1.addr.i76, align 2
+  %28 = load <8 x i16>, ptr %__p0.addr.i75, align 16
+  %29 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit.i79 = insertelement <8 x i16> poison, i16 %29, i32 0
+  %30 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit1.i80 = insertelement <8 x i16> %vecinit.i79, i16 %30, i32 1
+  %31 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit2.i81 = insertelement <8 x i16> %vecinit1.i80, i16 %31, i32 2
+  %32 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit3.i82 = insertelement <8 x i16> %vecinit2.i81, i16 %32, i32 3
+  %33 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit4.i83 = insertelement <8 x i16> %vecinit3.i82, i16 %33, i32 4
+  %34 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit5.i84 = insertelement <8 x i16> %vecinit4.i83, i16 %34, i32 5
+  %35 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit6.i85 = insertelement <8 x i16> %vecinit5.i84, i16 %35, i32 6
+  %36 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit7.i86 = insertelement <8 x i16> %vecinit6.i85, i16 %36, i32 7
+  store <8 x i16> %vecinit7.i86, ptr %.compoundliteral.i78, align 16
+  %37 = load <8 x i16>, ptr %.compoundliteral.i78, align 16
+  %mul.i87 = mul <8 x i16> %28, %37
+  store <8 x i16> %mul.i87, ptr %__ret.i77, align 16
+  %38 = load <8 x i16>, ptr %__ret.i77, align 16
+  %arrayidx24 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
+  store <8 x i16> %38, ptr %arrayidx24, align 16
+  %arrayidx25 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 1
+  %39 = load <8 x i16>, ptr %arrayidx25, align 16
+  %40 = load i16, ptr %scaling_factor, align 2
+  store <8 x i16> %39, ptr %__p0.addr.i62, align 16
+  store i16 %40, ptr %__p1.addr.i63, align 2
+  %41 = load <8 x i16>, ptr %__p0.addr.i62, align 16
+  %42 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit.i66 = insertelement <8 x i16> poison, i16 %42, i32 0
+  %43 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit1.i67 = insertelement <8 x i16> %vecinit.i66, i16 %43, i32 1
+  %44 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit2.i68 = insertelement <8 x i16> %vecinit1.i67, i16 %44, i32 2
+  %45 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit3.i69 = insertelement <8 x i16> %vecinit2.i68, i16 %45, i32 3
+  %46 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit4.i70 = insertelement <8 x i16> %vecinit3.i69, i16 %46, i32 4
+  %47 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit5.i71 = insertelement <8 x i16> %vecinit4.i70, i16 %47, i32 5
+  %48 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit6.i72 = insertelement <8 x i16> %vecinit5.i71, i16 %48, i32 6
+  %49 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit7.i73 = insertelement <8 x i16> %vecinit6.i72, i16 %49, i32 7
+  store <8 x i16> %vecinit7.i73, ptr %.compoundliteral.i65, align 16
+  %50 = load <8 x i16>, ptr %.compoundliteral.i65, align 16
+  %mul.i74 = mul <8 x i16> %41, %50
+  store <8 x i16> %mul.i74, ptr %__ret.i64, align 16
+  %51 = load <8 x i16>, ptr %__ret.i64, align 16
+  %arrayidx27 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
+  store <8 x i16> %51, ptr %arrayidx27, align 16
+  %arrayidx28 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 2
+  %52 = load <8 x i16>, ptr %arrayidx28, align 16
+  %53 = load i16, ptr %scaling_factor, align 2
+  store <8 x i16> %52, ptr %__p0.addr.i60, align 16
+  store i16 %53, ptr %__p1.addr.i, align 2
+  %54 = load <8 x i16>, ptr %__p0.addr.i60, align 16
+  %55 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit.i = insertelement <8 x i16> poison, i16 %55, i32 0
+  %56 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %56, i32 1
+  %57 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %57, i32 2
+  %58 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %58, i32 3
+  %59 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %59, i32 4
+  %60 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %60, i32 5
+  %61 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %61, i32 6
+  %62 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %62, i32 7
+  store <8 x i16> %vecinit7.i, ptr %.compoundliteral.i, align 16
+  %63 = load <8 x i16>, ptr %.compoundliteral.i, align 16
+  %mul.i = mul <8 x i16> %54, %63
+  store <8 x i16> %mul.i, ptr %__ret.i61, align 16
+  %64 = load <8 x i16>, ptr %__ret.i61, align 16
+  %arrayidx30 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
+  store <8 x i16> %64, ptr %arrayidx30, align 16
+  %65 = load ptr, ptr %scale.addr, align 8
+  %cmp31 = icmp ne ptr %65, null
+  br i1 %cmp31, label %if.then, label %if.else
+
+if.then:                                          ; preds = %for.body
+  %arrayidx32 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
+  %66 = load <8 x i16>, ptr %arrayidx32, align 16
+  %67 = load ptr, ptr %scale.addr, align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp, ptr align 2 %67, i64 4, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp.coerce, ptr align 2 %agg.tmp, i64 4, i1 false)
+  %68 = load i64, ptr %agg.tmp.coerce, align 8
+  %call33 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %66, i64 %68)
+  store <8 x i16> %call33, ptr %__s1, align 16
+  %69 = load ptr, ptr %dst.addr, align 8
+  %arrayidx34 = getelementptr inbounds %struct.cmplx_int16_t, ptr %69, i64 0
+  %70 = load <8 x i16>, ptr %__s1, align 16
+  %71 = bitcast <8 x i16> %70 to <16 x i8>
+  %72 = bitcast <16 x i8> %71 to <8 x i16>
+  store <8 x i16> %72, ptr %arrayidx34, align 2
+  %arrayidx36 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
+  %73 = load <8 x i16>, ptr %arrayidx36, align 16
+  %74 = load ptr, ptr %scale.addr, align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp37, ptr align 2 %74, i64 4, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp37.coerce, ptr align 2 %agg.tmp37, i64 4, i1 false)
+  %75 = load i64, ptr %agg.tmp37.coerce, align 8
+  %call38 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %73, i64 %75)
+  store <8 x i16> %call38, ptr %__s135, align 16
+  %76 = load ptr, ptr %dst.addr, align 8
+  %arrayidx39 = getelementptr inbounds %struct.cmplx_int16_t, ptr %76, i64 4
+  %77 = load <8 x i16>, ptr %__s135, align 16
+  %78 = bitcast <8 x i16> %77 to <16 x i8>
+  %79 = bitcast <16 x i8> %78 to <8 x i16>
+  store <8 x i16> %79, ptr %arrayidx39, align 2
+  %arrayidx41 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
+  %80 = load <8 x i16>, ptr %arrayidx41, align 16
+  %81 = load ptr, ptr %scale.addr, align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp42, ptr align 2 %81, i64 4, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp42.coerce, ptr align 2 %agg.tmp42, i64 4, i1 false)
+  %82 = load i64, ptr %agg.tmp42.coerce, align 8
+  %call43 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %80, i64 %82)
+  store <8 x i16> %call43, ptr %__s140, align 16
+  %83 = load ptr, ptr %dst.addr, align 8
+  %arrayidx44 = getelementptr inbounds %struct.cmplx_int16_t, ptr %83, i64 8
+  %84 = load <8 x i16>, ptr %__s140, align 16
+  %85 = bitcast <8 x i16> %84 to <16 x i8>
+  %86 = bitcast <16 x i8> %85 to <8 x i16>
+  store <8 x i16> %86, ptr %arrayidx44, align 2
+  %87 = load ptr, ptr %dst.addr, align 8
+  %add.ptr = getelementptr inbounds %struct.cmplx_int16_t, ptr %87, i64 12
+  store ptr %add.ptr, ptr %dst.addr, align 8
+  br label %if.end
+
+if.else:                                          ; preds = %for.body
+  %arrayidx46 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
+  %88 = load <8 x i16>, ptr %arrayidx46, align 16
+  store <8 x i16> %88, ptr %__s145, align 16
+  %89 = load ptr, ptr %dst.addr, align 8
+  %90 = load <8 x i16>, ptr %__s145, align 16
+  %91 = bitcast <8 x i16> %90 to <16 x i8>
+  %92 = bitcast <16 x i8> %91 to <8 x i16>
+  store <8 x i16> %92, ptr %89, align 2
+  %93 = load ptr, ptr %dst.addr, align 8
+  %add.ptr47 = getelementptr inbounds %struct.cmplx_int16_t, ptr %93, i64 4
+  store ptr %add.ptr47, ptr %dst.addr, align 8
+  %arrayidx49 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
+  %94 = load <8 x i16>, ptr %arrayidx49, align 16
+  store <8 x i16> %94, ptr %__s148, align 16
+  %95 = load ptr, ptr %dst.addr, align 8
+  %96 = load <8 x i16>, ptr %__s148, align 16
+  %97 = bitcast <8 x i16> %96 to <16 x i8>
+  %98 = bitcast <16 x i8> %97 to <8 x i16>
+  store <8 x i16> %98, ptr %95, align 2
+  %99 = load ptr, ptr %dst.addr, align 8
+  %add.ptr50 = getelementptr inbounds %struct.cmplx_int16_t, ptr %99, i64 4
+  store ptr %add.ptr50, ptr %dst.addr, align 8
+  %arrayidx52 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
+  %100 = load <8 x i16>, ptr %arrayidx52, align 16
+  store <8 x i16> %100, ptr %__s151, align 16
+  %101 = load ptr, ptr %dst.addr, align 8
+  %102 = load <8 x i16>, ptr %__s151, align 16
+  %103 = bitcast <8 x i16> %102 to <16 x i8>
+  %104 = bitcast <16 x i8> %103 to <8 x i16>
+  store <8 x i16> %104, ptr %101, align 2
+  %105 = load ptr, ptr %dst.addr, align 8
+  %add.ptr53 = getelementptr inbounds %struct.cmplx_int16_t, ptr %105, i64 4
+  store ptr %add.ptr53, ptr %dst.addr, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %106 = load i32, ptr %i, align 4
+  %inc = add i32 %106, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  ret i32 0
+}
+
+define internal noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %a, i64 %scale.coerce) #0 {
+; CHECK-LABEL: define internal fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(
+; CHECK-SAME: <8 x i16> noundef [[A:%.*]], i64 [[SCALE_COERCE:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SCALE_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[SCALE_COERCE]] to i16
+; CHECK-NEXT:    [[SCALE_SROA_2_0_EXTRACT_SHIFT36:%.*]] = lshr i64 [[SCALE_COERCE]], 16
+; CHECK-NEXT:    [[SCALE_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[SCALE_SROA_2_0_EXTRACT_SHIFT36]] to i16
+; CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEXT:    [[VECINIT_I19:%.*]] = insertelement <8 x i16> poison, i16 [[SCALE_SROA_0_0_EXTRACT_TRUNC]], i64 0
+; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[SCALE_SROA_2_0_EXTRACT_TRUNC]], i64 0
+; CHECK-NEXT:    [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[VQNEGQ_V1_I:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> [[VECINIT7_I]])
+; CHECK-NEXT:    [[VBSL5_I:%.*]] = shufflevector <8 x i16> [[VQNEGQ_V1_I]], <8 x i16> [[VECINIT_I]], <8 x i32> <i32 0, i32 8, i32 2, i32 8, i32 4, i32 8, i32 6, i32 8>
+; CHECK-NEXT:    [[SHUFFLE_I85:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[SHUFFLE_I82:%.*]] = shufflevector <8 x i16> [[VECINIT_I19]], <8 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VQDMULL_V2_I72:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I85]], <4 x i16> [[SHUFFLE_I82]])
+; CHECK-NEXT:    [[SHUFFLE_I97:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I97]], <4 x i16> [[SHUFFLE_I82]])
+; CHECK-NEXT:    [[SHUFFLE_I79:%.*]] = shufflevector <8 x i16> [[SHUFFLE_I]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[SHUFFLE_I76:%.*]] = shufflevector <8 x i16> [[VBSL5_I]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[VQDMLAL2_I106:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I79]], <4 x i16> [[SHUFFLE_I76]])
+; CHECK-NEXT:    [[VQDMLAL_V3_I107:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMULL_V2_I72]], <4 x i32> [[VQDMLAL2_I106]])
+; CHECK-NEXT:    [[SHUFFLE_I91:%.*]] = shufflevector <8 x i16> [[SHUFFLE_I]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[SHUFFLE_I88:%.*]] = shufflevector <8 x i16> [[VBSL5_I]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I91]], <4 x i16> [[SHUFFLE_I88]])
+; CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMULL_V2_I]], <4 x i32> [[VQDMLAL2_I]])
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[VQDMLAL_V3_I107]] to <8 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VQDMLAL_V3_I]] to <8 x i16>
+; CHECK-NEXT:    [[SHUFFLE_I61:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    ret <8 x i16> [[SHUFFLE_I61]]
+;
+entry:
+  %__p0.addr.i102 = alloca <4 x i32>, align 16
+  %__p1.addr.i103 = alloca <4 x i16>, align 8
+  %__p2.addr.i104 = alloca <4 x i16>, align 8
+  %__ret.i105 = alloca <4 x i32>, align 16
+  %__p0.addr.i98 = alloca <4 x i32>, align 16
+  %__p1.addr.i99 = alloca <4 x i16>, align 8
+  %__p2.addr.i100 = alloca <4 x i16>, align 8
+  %__ret.i101 = alloca <4 x i32>, align 16
+  %__p0.addr.i95 = alloca <8 x i16>, align 16
+  %__ret.i96 = alloca <4 x i16>, align 8
+  %__p0.addr.i92 = alloca <8 x i16>, align 16
+  %__ret.i93 = alloca <4 x i16>, align 8
+  %__p0.addr.i89 = alloca <8 x i16>, align 16
+  %__ret.i90 = alloca <4 x i16>, align 8
+  %__p0.addr.i86 = alloca <8 x i16>, align 16
+  %__ret.i87 = alloca <4 x i16>, align 8
+  %__p0.addr.i83 = alloca <8 x i16>, align 16
+  %__ret.i84 = alloca <4 x i16>, align 8
+  %__p0.addr.i80 = alloca <8 x i16>, align 16
+  %__ret.i81 = alloca <4 x i16>, align 8
+  %__p0.addr.i77 = alloca <8 x i16>, align 16
+  %__ret.i78 = alloca <4 x i16>, align 8
+  %__p0.addr.i74 = alloca <8 x i16>, align 16
+  %__ret.i75 = alloca <4 x i16>, align 8
+  %__p0.addr.i69 = alloca <4 x i16>, align 8
+  %__p1.addr.i70 = alloca <4 x i16>, align 8
+  %__ret.i71 = alloca <4 x i32>, align 16
+  %__p0.addr.i66 = alloca <4 x i16>, align 8
+  %__p1.addr.i67 = alloca <4 x i16>, align 8
+  %__ret.i68 = alloca <4 x i32>, align 16
+  %__p0.addr.i64 = alloca <4 x i32>, align 16
+  %__ret.i65 = alloca <8 x i16>, align 16
+  %__p0.addr.i62 = alloca <4 x i32>, align 16
+  %__ret.i63 = alloca <8 x i16>, align 16
+  %__p0.addr.i58 = alloca <8 x i16>, align 16
+  %__p1.addr.i59 = alloca <8 x i16>, align 16
+  %__ret.i60 = alloca <8 x i16>, align 16
+  %__p0.addr.i51 = alloca <4 x i32>, align 16
+  %__p1.addr.i52 = alloca <8 x i16>, align 16
+  %__p2.addr.i53 = alloca <8 x i16>, align 16
+  %__ret.i54 = alloca <4 x i32>, align 16
+  %a.addr.i46 = alloca <4 x i32>, align 16
+  %b.addr.i47 = alloca <8 x i16>, align 16
+  %c.addr.i = alloca <8 x i16>, align 16
+  %__p0.addr.i40 = alloca <8 x i16>, align 16
+  %__p1.addr.i41 = alloca <8 x i16>, align 16
+  %__ret.i42 = alloca <4 x i32>, align 16
+  %a.addr.i = alloca <8 x i16>, align 16
+  %b.addr.i = alloca <8 x i16>, align 16
+  %__p0.addr.i38 = alloca <8 x i16>, align 16
+  %__ret.i39 = alloca <8 x i16>, align 16
+  %__p0.addr.i36 = alloca <8 x i16>, align 16
+  %__p1.addr.i = alloca <8 x i16>, align 16
+  %__p2.addr.i = alloca <8 x i16>, align 16
+  %__ret.i37 = alloca <8 x i16>, align 16
+  %__p0.addr.i29 = alloca i32, align 4
+  %__ret.i30 = alloca <4 x i32>, align 16
+  %.compoundliteral.i31 = alloca <4 x i32>, align 16
+  %__p0.addr.i27 = alloca <4 x i32>, align 16
+  %__ret.i28 = alloca <8 x i16>, align 16
+  %__p0.addr.i16 = alloca i16, align 2
+  %__ret.i17 = alloca <8 x i16>, align 16
+  %.compoundliteral.i18 = alloca <8 x i16>, align 16
+  %__p0.addr.i14 = alloca i16, align 2
+  %__ret.i15 = alloca <8 x i16>, align 16
+  %.compoundliteral.i = alloca <8 x i16>, align 16
+  %__p0.addr.i = alloca <8 x i16>, align 16
+  %__ret.i = alloca <8 x i16>, align 16
+  %scale = alloca %struct.cmplx_int16_t, align 2
+  %a.addr = alloca <8 x i16>, align 16
+  %a_rev = alloca <8 x i16>, align 16
+  %cc = alloca <8 x i16>, align 16
+  %dd = alloca <8 x i16>, align 16
+  %mult_mask = alloca <8 x i16>, align 16
+  %lo32 = alloca <4 x i32>, align 16
+  %hi32 = alloca <4 x i32>, align 16
+  %coerce.val.ii = trunc i64 %scale.coerce to i32
+  store i32 %coerce.val.ii, ptr %scale, align 2
+  store <8 x i16> %a, ptr %a.addr, align 16
+  %0 = load <8 x i16>, ptr %a.addr, align 16
+  store <8 x i16> %0, ptr %__p0.addr.i, align 16
+  %1 = load <8 x i16>, ptr %__p0.addr.i, align 16
+  %2 = load <8 x i16>, ptr %__p0.addr.i, align 16
+  %shuffle.i = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  store <8 x i16> %shuffle.i, ptr %__ret.i, align 16
+  %3 = load <8 x i16>, ptr %__ret.i, align 16
+  store <8 x i16> %3, ptr %a_rev, align 16
+  %re = getelementptr inbounds nuw %struct.cmplx_int16_t, ptr %scale, i32 0, i32 0
+  %4 = load i16, ptr %re, align 2
+  store i16 %4, ptr %__p0.addr.i16, align 2
+  %5 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit.i19 = insertelement <8 x i16> poison, i16 %5, i32 0
+  %6 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit1.i20 = insertelement <8 x i16> %vecinit.i19, i16 %6, i32 1
+  %7 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit2.i21 = insertelement <8 x i16> %vecinit1.i20, i16 %7, i32 2
+  %8 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit3.i22 = insertelement <8 x i16> %vecinit2.i21, i16 %8, i32 3
+  %9 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit4.i23 = insertelement <8 x i16> %vecinit3.i22, i16 %9, i32 4
+  %10 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit5.i24 = insertelement <8 x i16> %vecinit4.i23, i16 %10, i32 5
+  %11 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit6.i25 = insertelement <8 x i16> %vecinit5.i24, i16 %11, i32 6
+  %12 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit7.i26 = insertelement <8 x i16> %vecinit6.i25, i16 %12, i32 7
+  store <8 x i16> %vecinit7.i26, ptr %.compoundliteral.i18, align 16
+  %13 = load <8 x i16>, ptr %.compoundliteral.i18, align 16
+  store <8 x i16> %13, ptr %__ret.i17, align 16
+  %14 = load <8 x i16>, ptr %__ret.i17, align 16
+  store <8 x i16> %14, ptr %cc, align 16
+  %im = getelementptr inbounds nuw %struct.cmplx_int16_t, ptr %scale, i32 0, i32 1
+  %15 = load i16, ptr %im, align 2
+  store i16 %15, ptr %__p0.addr.i14, align 2
+  %16 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit.i = insertelement <8 x i16> poison, i16 %16, i32 0
+  %17 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %17, i32 1
+  %18 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %18, i32 2
+  %19 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %19, i32 3
+  %20 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %20, i32 4
+  %21 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %21, i32 5
+  %22 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %22, i32 6
+  %23 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %23, i32 7
+  store <8 x i16> %vecinit7.i, ptr %.compoundliteral.i, align 16
+  %24 = load <8 x i16>, ptr %.compoundliteral.i, align 16
+  store <8 x i16> %24, ptr %__ret.i15, align 16
+  %25 = load <8 x i16>, ptr %__ret.i15, align 16
+  store <8 x i16> %25, ptr %dd, align 16
+  store i32 65535, ptr %__p0.addr.i29, align 4
+  %26 = load i32, ptr %__p0.addr.i29, align 4
+  %vecinit.i32 = insertelement <4 x i32> poison, i32 %26, i32 0
+  %27 = load i32, ptr %__p0.addr.i29, align 4
+  %vecinit1.i33 = insertelement <4 x i32> %vecinit.i32, i32 %27, i32 1
+  %28 = load i32, ptr %__p0.addr.i29, align 4
+  %vecinit2.i34 = insertelement <4 x i32> %vecinit1.i33, i32 %28, i32 2
+  %29 = load i32, ptr %__p0.addr.i29, align 4
+  %vecinit3.i35 = insertelement <4 x i32> %vecinit2.i34, i32 %29, i32 3
+  store <4 x i32> %vecinit3.i35, ptr %.compoundliteral.i31, align 16
+  %30 = load <4 x i32>, ptr %.compoundliteral.i31, align 16
+  store <4 x i32> %30, ptr %__ret.i30, align 16
+  %31 = load <4 x i32>, ptr %__ret.i30, align 16
+  store <4 x i32> %31, ptr %__p0.addr.i27, align 16
+  %32 = load <4 x i32>, ptr %__p0.addr.i27, align 16
+  %33 = bitcast <4 x i32> %32 to <8 x i16>
+  store <8 x i16> %33, ptr %__ret.i28, align 16
+  %34 = load <8 x i16>, ptr %__ret.i28, align 16
+  store <8 x i16> %34, ptr %mult_mask, align 16
+  %35 = load <8 x i16>, ptr %mult_mask, align 16
+  %36 = load <8 x i16>, ptr %dd, align 16
+  store <8 x i16> %36, ptr %__p0.addr.i38, align 16
+  %37 = load <8 x i16>, ptr %__p0.addr.i38, align 16
+  %38 = bitcast <8 x i16> %37 to <16 x i8>
+  %vqnegq_v1.i = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %37)
+  %vqnegq_v2.i = bitcast <8 x i16> %vqnegq_v1.i to <16 x i8>
+  store <8 x i16> %vqnegq_v1.i, ptr %__ret.i39, align 16
+  %39 = load <8 x i16>, ptr %__ret.i39, align 16
+  %40 = load <8 x i16>, ptr %dd, align 16
+  store <8 x i16> %35, ptr %__p0.addr.i36, align 16
+  store <8 x i16> %39, ptr %__p1.addr.i, align 16
+  store <8 x i16> %40, ptr %__p2.addr.i, align 16
+  %41 = load <8 x i16>, ptr %__p0.addr.i36, align 16
+  %42 = bitcast <8 x i16> %41 to <16 x i8>
+  %43 = load <8 x i16>, ptr %__p1.addr.i, align 16
+  %44 = bitcast <8 x i16> %43 to <16 x i8>
+  %45 = load <8 x i16>, ptr %__p2.addr.i, align 16
+  %46 = bitcast <8 x i16> %45 to <16 x i8>
+  %vbsl3.i = and <8 x i16> %41, %43
+  %47 = xor <8 x i16> %41, splat (i16 -1)
+  %vbsl4.i = and <8 x i16> %47, %45
+  %vbsl5.i = or <8 x i16> %vbsl3.i, %vbsl4.i
+  store <8 x i16> %vbsl5.i, ptr %__ret.i37, align 16
+  %48 = load <8 x i16>, ptr %__ret.i37, align 16
+  store <8 x i16> %48, ptr %dd, align 16
+  %49 = load <8 x i16>, ptr %a.addr, align 16
+  %50 = load <8 x i16>, ptr %cc, align 16
+  store <8 x i16> %49, ptr %a.addr.i, align 16
+  store <8 x i16> %50, ptr %b.addr.i, align 16
+  %51 = load <8 x i16>, ptr %a.addr.i, align 16
+  store <8 x i16> %51, ptr %__p0.addr.i83, align 16
+  %52 = load <8 x i16>, ptr %__p0.addr.i83, align 16
+  %53 = load <8 x i16>, ptr %__p0.addr.i83, align 16
+  %shuffle.i85 = shufflevector <8 x i16> %52, <8 x i16> %53, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i16> %shuffle.i85, ptr %__ret.i84, align 8
+  %54 = load <4 x i16>, ptr %__ret.i84, align 8
+  %55 = load <8 x i16>, ptr %b.addr.i, align 16
+  store <8 x i16> %55, ptr %__p0.addr.i80, align 16
+  %56 = load <8 x i16>, ptr %__p0.addr.i80, align 16
+  %57 = load <8 x i16>, ptr %__p0.addr.i80, align 16
+  %shuffle.i82 = shufflevector <8 x i16> %56, <8 x i16> %57, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i16> %shuffle.i82, ptr %__ret.i81, align 8
+  %58 = load <4 x i16>, ptr %__ret.i81, align 8
+  store <4 x i16> %54, ptr %__p0.addr.i69, align 8
+  store <4 x i16> %58, ptr %__p1.addr.i70, align 8
+  %59 = load <4 x i16>, ptr %__p0.addr.i69, align 8
+  %60 = bitcast <4 x i16> %59 to <8 x i8>
+  %61 = load <4 x i16>, ptr %__p1.addr.i70, align 8
+  %62 = bitcast <4 x i16> %61 to <8 x i8>
+  %vqdmull_v2.i72 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %59, <4 x i16> %61)
+  %vqdmull_v3.i73 = bitcast <4 x i32> %vqdmull_v2.i72 to <16 x i8>
+  store <4 x i32> %vqdmull_v2.i72, ptr %__ret.i71, align 16
+  %63 = load <4 x i32>, ptr %__ret.i71, align 16
+  store <4 x i32> %63, ptr %lo32, align 16
+  %64 = load <8 x i16>, ptr %a.addr, align 16
+  %65 = load <8 x i16>, ptr %cc, align 16
+  store <8 x i16> %64, ptr %__p0.addr.i40, align 16
+  store <8 x i16> %65, ptr %__p1.addr.i41, align 16
+  %66 = load <8 x i16>, ptr %__p0.addr.i40, align 16
+  store <8 x i16> %66, ptr %__p0.addr.i95, align 16
+  %67 = load <8 x i16>, ptr %__p0.addr.i95, align 16
+  %68 = load <8 x i16>, ptr %__p0.addr.i95, align 16
+  %shuffle.i97 = shufflevector <8 x i16> %67, <8 x i16> %68, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x i16> %shuffle.i97, ptr %__ret.i96, align 8
+  %69 = load <4 x i16>, ptr %__ret.i96, align 8
+  %70 = load <8 x i16>, ptr %__p1.addr.i41, align 16
+  store <8 x i16> %70, ptr %__p0.addr.i92, align 16
+  %71 = load <8 x i16>, ptr %__p0.addr.i92, align 16
+  %72 = load <8 x i16>, ptr %__p0.addr.i92, align 16
+  %shuffle.i94 = shufflevector <8 x i16> %71, <8 x i16> %72, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x i16> %shuffle.i94, ptr %__ret.i93, align 8
+  %73 = load <4 x i16>, ptr %__ret.i93, align 8
+  store <4 x i16> %69, ptr %__p0.addr.i66, align 8
+  store <4 x i16> %73, ptr %__p1.addr.i67, align 8
+  %74 = load <4 x i16>, ptr %__p0.addr.i66, align 8
+  %75 = bitcast <4 x i16> %74 to <8 x i8>
+  %76 = load <4 x i16>, ptr %__p1.addr.i67, align 8
+  %77 = bitcast <4 x i16> %76 to <8 x i8>
+  %vqdmull_v2.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %74, <4 x i16> %76)
+  %vqdmull_v3.i = bitcast <4 x i32> %vqdmull_v2.i to <16 x i8>
+  store <4 x i32> %vqdmull_v2.i, ptr %__ret.i68, align 16
+  %78 = load <4 x i32>, ptr %__ret.i68, align 16
+  store <4 x i32> %78, ptr %__ret.i42, align 16
+  %79 = load <4 x i32>, ptr %__ret.i42, align 16
+  store <4 x i32> %79, ptr %hi32, align 16
+  %80 = load <4 x i32>, ptr %lo32, align 16
+  %81 = load <8 x i16>, ptr %a_rev, align 16
+  %82 = load <8 x i16>, ptr %dd, align 16
+  store <4 x i32> %80, ptr %a.addr.i46, align 16
+  store <8 x i16> %81, ptr %b.addr.i47, align 16
+  store <8 x i16> %82, ptr %c.addr.i, align 16
+  %83 = load <4 x i32>, ptr %a.addr.i46, align 16
+  %84 = load <8 x i16>, ptr %b.addr.i47, align 16
+  store <8 x i16> %84, ptr %__p0.addr.i77, align 16
+  %85 = load <8 x i16>, ptr %__p0.addr.i77, align 16
+  %86 = load <8 x i16>, ptr %__p0.addr.i77, align 16
+  %shuffle.i79 = shufflevector <8 x i16> %85, <8 x i16> %86, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i16> %shuffle.i79, ptr %__ret.i78, align 8
+  %87 = load <4 x i16>, ptr %__ret.i78, align 8
+  %88 = load <8 x i16>, ptr %c.addr.i, align 16
+  store <8 x i16> %88, ptr %__p0.addr.i74, align 16
+  %89 = load <8 x i16>, ptr %__p0.addr.i74, align 16
+  %90 = load <8 x i16>, ptr %__p0.addr.i74, align 16
+  %shuffle.i76 = shufflevector <8 x i16> %89, <8 x i16> %90, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i16> %shuffle.i76, ptr %__ret.i75, align 8
+  %91 = load <4 x i16>, ptr %__ret.i75, align 8
+  store <4 x i32> %83, ptr %__p0.addr.i102, align 16
+  store <4 x i16> %87, ptr %__p1.addr.i103, align 8
+  store <4 x i16> %91, ptr %__p2.addr.i104, align 8
+  %92 = load <4 x i32>, ptr %__p0.addr.i102, align 16
+  %93 = bitcast <4 x i32> %92 to <16 x i8>
+  %94 = load <4 x i16>, ptr %__p1.addr.i103, align 8
+  %95 = bitcast <4 x i16> %94 to <8 x i8>
+  %96 = load <4 x i16>, ptr %__p2.addr.i104, align 8
+  %97 = bitcast <4 x i16> %96 to <8 x i8>
+  %vqdmlal2.i106 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %94, <4 x i16> %96)
+  %vqdmlal_v3.i107 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %92, <4 x i32> %vqdmlal2.i106)
+  store <4 x i32> %vqdmlal_v3.i107, ptr %__ret.i105, align 16
+  %98 = load <4 x i32>, ptr %__ret.i105, align 16
+  store <4 x i32> %98, ptr %lo32, align 16
+  %99 = load <4 x i32>, ptr %hi32, align 16
+  %100 = load <8 x i16>, ptr %a_rev, align 16
+  %101 = load <8 x i16>, ptr %dd, align 16
+  store <4 x i32> %99, ptr %__p0.addr.i51, align 16
+  store <8 x i16> %100, ptr %__p1.addr.i52, align 16
+  store <8 x i16> %101, ptr %__p2.addr.i53, align 16
+  %102 = load <4 x i32>, ptr %__p0.addr.i51, align 16
+  %103 = load <8 x i16>, ptr %__p1.addr.i52, align 16
+  store <8 x i16> %103, ptr %__p0.addr.i89, align 16
+  %104 = load <8 x i16>, ptr %__p0.addr.i89, align 16
+  %105 = load <8 x i16>, ptr %__p0.addr.i89, align 16
+  %shuffle.i91 = shufflevector <8 x i16> %104, <8 x i16> %105, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x i16> %shuffle.i91, ptr %__ret.i90, align 8
+  %106 = load <4 x i16>, ptr %__ret.i90, align 8
+  %107 = load <8 x i16>, ptr %__p2.addr.i53, align 16
+  store <8 x i16> %107, ptr %__p0.addr.i86, align 16
+  %108 = load <8 x i16>, ptr %__p0.addr.i86, align 16
+  %109 = load <8 x i16>, ptr %__p0.addr.i86, align 16
+  %shuffle.i88 = shufflevector <8 x i16> %108, <8 x i16> %109, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x i16> %shuffle.i88, ptr %__ret.i87, align 8
+  %110 = load <4 x i16>, ptr %__ret.i87, align 8
+  store <4 x i32> %102, ptr %__p0.addr.i98, align 16
+  store <4 x i16> %106, ptr %__p1.addr.i99, align 8
+  store <4 x i16> %110, ptr %__p2.addr.i100, align 8
+  %111 = load <4 x i32>, ptr %__p0.addr.i98, align 16
+  %112 = bitcast <4 x i32> %111 to <16 x i8>
+  %113 = load <4 x i16>, ptr %__p1.addr.i99, align 8
+  %114 = bitcast <4 x i16> %113 to <8 x i8>
+  %115 = load <4 x i16>, ptr %__p2.addr.i100, align 8
+  %116 = bitcast <4 x i16> %115 to <8 x i8>
+  %vqdmlal2.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %113, <4 x i16> %115)
+  %vqdmlal_v3.i = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %111, <4 x i32> %vqdmlal2.i)
+  store <4 x i32> %vqdmlal_v3.i, ptr %__ret.i101, align 16
+  %117 = load <4 x i32>, ptr %__ret.i101, align 16
+  store <4 x i32> %117, ptr %__ret.i54, align 16
+  %118 = load <4 x i32>, ptr %__ret.i54, align 16
+  store <4 x i32> %118, ptr %hi32, align 16
+  %119 = load <4 x i32>, ptr %lo32, align 16
+  store <4 x i32> %119, ptr %__p0.addr.i64, align 16
+  %120 = load <4 x i32>, ptr %__p0.addr.i64, align 16
+  %121 = bitcast <4 x i32> %120 to <8 x i16>
+  store <8 x i16> %121, ptr %__ret.i65, align 16
+  %122 = load <8 x i16>, ptr %__ret.i65, align 16
+  %123 = load <4 x i32>, ptr %hi32, align 16
+  store <4 x i32> %123, ptr %__p0.addr.i62, align 16
+  %124 = load <4 x i32>, ptr %__p0.addr.i62, align 16
+  %125 = bitcast <4 x i32> %124 to <8 x i16>
+  store <8 x i16> %125, ptr %__ret.i63, align 16
+  %126 = load <8 x i16>, ptr %__ret.i63, align 16
+  store <8 x i16> %122, ptr %__p0.addr.i58, align 16
+  store <8 x i16> %126, ptr %__p1.addr.i59, align 16
+  %127 = load <8 x i16>, ptr %__p0.addr.i58, align 16
+  %128 = load <8 x i16>, ptr %__p1.addr.i59, align 16
+  %shuffle.i61 = shufflevector <8 x i16> %127, <8 x i16> %128, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  store <8 x i16> %shuffle.i61, ptr %__ret.i60, align 16
+  %129 = load <8 x i16>, ptr %__ret.i60, align 16
+  ret <8 x i16> %129
+}
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) #2
+
+attributes #0 = { mustprogress noinline uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fullfp16,+jsconv,+lse,+neon,+pauth,+predres,+ras,+rcpc,+rdm,+sb,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" }
+attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 2}
+!2 = !{i32 7, !"frame-pointer", i32 1}
+!3 = !{!"clang version 20.0.0git"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+;.
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]}
+; CHECK: [[META5]] = !{!"llvm.loop.mustprogress"}
+;.
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
index 6c45442..13ea35a 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "arm64-apple-macosx11.0.0"
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll
index b2d6455..a38413f 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "arm64-apple-macosx11.0.0"
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
index 82b1cf9..f583a61 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 target triple = "arm64-apple-darwin"
 
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
index 2f61c89..801a8a0 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes='default<O3>' -S -o - %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S -o - %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "arm64-apple-macosx14.0.0"
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
index b14a36c..2703d23 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default<O3>' -enable-loop-flatten -loop-flatten-cost-threshold=3 -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -enable-loop-flatten -loop-flatten-cost-threshold=3 -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64"
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
index 7fccfee..886e7a7 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default<O3>' -enable-matrix -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -enable-matrix -S %s | FileCheck %s
 
 target triple = "arm64-apple-ios"
 
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll
index 10b07ad..d340638 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt %s -mtriple=arm64-apple-macosx -passes='default<O3>' -inline-threshold=2 -inline-call-penalty=5 -S | FileCheck %s
+; RUN: opt %s -mtriple=arm64-apple-macosx -passes="default<O3>" -inline-threshold=2 -inline-call-penalty=5 -S | FileCheck %s
 
 declare i64 @llvm.objectsize.i64.p0(ptr, i1, i1, i1)
 declare ptr @__memcpy_chk(ptr, ptr, i64, i64)
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll
index b1d0c70..76d9d14 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1m.main-arm-none-eabi"
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll
index 5b7622b..2ab6f2b 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -O3 -S                                        | FileCheck --check-prefix=OLDPM %s
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck --check-prefix=NEWPM %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck --check-prefix=NEWPM %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv6m-none-none-eabi"
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll
index aab787b..778f25f 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv6m-none-none-eabi"
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
index ef65101..9032c36 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s
 
 ; This test after a lot of cleanup should produce pick a tail-predicated 8x
 ; vector loop. The 8x will be more profitable, to pick a VQDMULH.s16 instruction.
@@ -15,7 +15,7 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
 ; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
 ; CHECK:       while.body.preheader:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[WHILE_BODY_PREHEADER18:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[WHILE_BODY_PREHEADER15:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[BLOCKSIZE]], -8
 ; CHECK-NEXT:    [[IND_END:%.*]] = and i32 [[BLOCKSIZE]], 7
@@ -48,18 +48,18 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[BLOCKSIZE]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[WHILE_BODY_PREHEADER18]]
-; CHECK:       while.body.preheader18:
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[WHILE_BODY_PREHEADER15]]
+; CHECK:       while.body.preheader15:
 ; CHECK-NEXT:    [[BLKCNT_06_PH:%.*]] = phi i32 [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[PSRCA_ADDR_05_PH:%.*]] = phi ptr [ [[PSRCA]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END7]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[PDST_ADDR_04_PH:%.*]] = phi ptr [ [[PDST]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[PSRCB_ADDR_03_PH:%.*]] = phi ptr [ [[PSRCB]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END11]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
-; CHECK-NEXT:    [[BLKCNT_06:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BLKCNT_06_PH]], [[WHILE_BODY_PREHEADER18]] ]
-; CHECK-NEXT:    [[PSRCA_ADDR_05:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[PSRCA_ADDR_05_PH]], [[WHILE_BODY_PREHEADER18]] ]
-; CHECK-NEXT:    [[PDST_ADDR_04:%.*]] = phi ptr [ [[INCDEC_PTR4:%.*]], [[WHILE_BODY]] ], [ [[PDST_ADDR_04_PH]], [[WHILE_BODY_PREHEADER18]] ]
-; CHECK-NEXT:    [[PSRCB_ADDR_03:%.*]] = phi ptr [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[PSRCB_ADDR_03_PH]], [[WHILE_BODY_PREHEADER18]] ]
+; CHECK-NEXT:    [[BLKCNT_06:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BLKCNT_06_PH]], [[WHILE_BODY_PREHEADER15]] ]
+; CHECK-NEXT:    [[PSRCA_ADDR_05:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[PSRCA_ADDR_05_PH]], [[WHILE_BODY_PREHEADER15]] ]
+; CHECK-NEXT:    [[PDST_ADDR_04:%.*]] = phi ptr [ [[INCDEC_PTR4:%.*]], [[WHILE_BODY]] ], [ [[PDST_ADDR_04_PH]], [[WHILE_BODY_PREHEADER15]] ]
+; CHECK-NEXT:    [[PSRCB_ADDR_03:%.*]] = phi ptr [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[PSRCB_ADDR_03_PH]], [[WHILE_BODY_PREHEADER15]] ]
 ; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds nuw i8, ptr [[PSRCA_ADDR_05]], i32 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[PSRCA_ADDR_05]], align 2
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP10]] to i32
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/mve-floatreduce.ll b/llvm/test/Transforms/PhaseOrdering/ARM/mve-floatreduce.ll
index 6cbba5c..664953a 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/mve-floatreduce.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/mve-floatreduce.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -O3 -S                   | FileCheck %s
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1m.main-none-none-eabi"
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
index 57a3d81..5a3742c 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default<O3>' -unroll-runtime -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -unroll-runtime -S %s | FileCheck %s
 
 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx"
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index 798824b..a4aea02 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
 ; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
 ; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
-; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
 ; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
 ; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 
@@ -78,30 +78,16 @@ define <8 x i16> @add_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-NEXT:    ret <8 x i16> [[RESULT]]
 ;
 ; SSE4-LABEL: @add_v8i16_u1234567(
-; SSE4-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP1:%.*]] = add <8 x i16> [[A]], [[SHIFT]]
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
-; SSE4-NEXT:    [[HADD32:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP4]], <8 x i32> <i32 poison, i32 2, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 5, i32 6, i32 8, i32 10, i32 12, i32 14>
+; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 4, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; SSE4-NEXT:    [[TMP7:%.*]] = add <8 x i16> [[TMP5]], [[TMP6]]
-; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[HADD32]], <8 x i16> [[TMP7]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE4-NEXT:    ret <8 x i16> [[RESULT]]
+; SSE4-NEXT:    ret <8 x i16> [[TMP7]]
 ;
 ; AVX-LABEL: @add_v8i16_u1234567(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = add <8 x i16> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[HADD32:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP4]], <8 x i32> <i32 poison, i32 2, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 5, i32 6, i32 8, i32 10, i32 12, i32 14>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 4, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; AVX-NEXT:    [[TMP7:%.*]] = add <8 x i16> [[TMP5]], [[TMP6]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[HADD32]], <8 x i16> [[TMP7]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT:    ret <8 x i16> [[RESULT]]
+; AVX-NEXT:    ret <8 x i16> [[TMP7]]
 ;
   %a0 = extractelement <8 x i16> %a, i32 0
   %a1 = extractelement <8 x i16> %a, i32 1
@@ -172,13 +158,10 @@ define <4 x i32> @add_v4i32_0123(<4 x i32> %a, <4 x i32> %b) {
 
 define <4 x i32> @add_v4i32_u123(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @add_v4i32_u123(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 poison, i32 3, i32 4, i32 7>
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 poison, i32 2, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x i32> [[RESULT1]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -202,13 +185,10 @@ define <4 x i32> @add_v4i32_u123(<4 x i32> %a, <4 x i32> %b) {
 
 define <4 x i32> @add_v4i32_0u23(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @add_v4i32_0u23(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 poison, i32 4, i32 7>
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x i32> [[RESULT1]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -232,40 +212,28 @@ define <4 x i32> @add_v4i32_0u23(<4 x i32> %a, <4 x i32> %b) {
 
 define <4 x i32> @add_v4i32_01u3(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-LABEL: @add_v4i32_01u3(
-; SSE2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]]
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B:%.*]], <4 x i32> <i32 2, i32 poison, i32 6, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 3, i32 poison, i32 7, i32 poison>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
 ; SSE2-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
-; SSE2-NEXT:    ret <4 x i32> [[RESULT1]]
+; SSE2-NEXT:    ret <4 x i32> [[TMP4]]
 ;
 ; SSE4-LABEL: @add_v4i32_01u3(
-; SSE4-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; SSE4-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[B]]
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 1, i32 2, i32 poison, i32 6>
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 3, i32 poison, i32 7>
 ; SSE4-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
-; SSE4-NEXT:    ret <4 x i32> [[RESULT]]
+; SSE4-NEXT:    ret <4 x i32> [[TMP4]]
 ;
 ; AVX2-LABEL: @add_v4i32_01u3(
-; AVX2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; AVX2-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[B]]
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 1, i32 2, i32 poison, i32 6>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 3, i32 poison, i32 7>
 ; AVX2-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
-; AVX2-NEXT:    ret <4 x i32> [[RESULT]]
+; AVX2-NEXT:    ret <4 x i32> [[TMP4]]
 ;
 ; AVX512-LABEL: @add_v4i32_01u3(
-; AVX512-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]]
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B:%.*]], <4 x i32> <i32 2, i32 poison, i32 6, i32 poison>
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 3, i32 poison, i32 7, i32 poison>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
 ; AVX512-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; AVX512-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
-; AVX512-NEXT:    ret <4 x i32> [[RESULT1]]
+; AVX512-NEXT:    ret <4 x i32> [[TMP4]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -289,13 +257,10 @@ define <4 x i32> @add_v4i32_01u3(<4 x i32> %a, <4 x i32> %b) {
 
 define <4 x i32> @add_v4i32_012u(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @add_v4i32_012u(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B:%.*]], <4 x i32> <i32 2, i32 4, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 3, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
-; CHECK-NEXT:    ret <4 x i32> [[RESULT1]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -420,46 +385,30 @@ define <8 x i32> @add_v8i32_01234567(<8 x i32> %a, <8 x i32> %b) {
 
 define <8 x i32> @add_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) {
 ; SSE2-LABEL: @add_v8i32_01234u67(
-; SSE2-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]]
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <2 x i32> <i32 5, i32 6>
-; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 4, i32 7>
-; SSE2-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[TMP5]], [[TMP6]]
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 13, i32 14>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 12, i32 15>
 ; SSE2-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[TMP2]], [[TMP3]]
-; SSE2-NEXT:    [[HADD4:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
-; SSE2-NEXT:    ret <8 x i32> [[RESULT]]
+; SSE2-NEXT:    ret <8 x i32> [[TMP4]]
 ;
 ; SSE4-LABEL: @add_v8i32_01234u67(
 ; SSE4-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
 ; SSE4-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
 ; SSE4-NEXT:    [[A45:%.*]] = add i32 [[A4]], [[A5]]
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <2 x i32> <i32 5, i32 6>
-; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 4, i32 7>
-; SSE4-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[TMP4]], [[TMP5]]
-; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
 ; SSE4-NEXT:    [[HADD4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A45]], i64 4
-; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP6:%.*]] = add <8 x i32> [[TMP4]], [[TMP5]]
 ; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
 ; SSE4-NEXT:    ret <8 x i32> [[RESULT]]
 ;
 ; AVX-LABEL: @add_v8i32_01234u67(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[HADD4:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 13, i32 14>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 12, i32 15>
 ; AVX-NEXT:    [[TMP7:%.*]] = add <8 x i32> [[TMP5]], [[TMP6]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
-; AVX-NEXT:    ret <8 x i32> [[RESULT]]
+; AVX-NEXT:    ret <8 x i32> [[TMP7]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -530,13 +479,10 @@ define <4 x float> @add_v4f32_0123(<4 x float> %a, <4 x float> %b) {
 
 define <4 x float> @add_v4f32_u123(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @add_v4f32_u123(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 3, i32 4, i32 7>
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 poison, i32 2, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x float> [[RESULT1]]
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
@@ -560,13 +506,10 @@ define <4 x float> @add_v4f32_u123(<4 x float> %a, <4 x float> %b) {
 
 define <4 x float> @add_v4f32_0u23(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @add_v4f32_0u23(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 poison, i32 4, i32 7>
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x float> [[RESULT1]]
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
@@ -589,41 +532,11 @@ define <4 x float> @add_v4f32_0u23(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @add_v4f32_01u3(<4 x float> %a, <4 x float> %b) {
-; SSE2-LABEL: @add_v4f32_01u3(
-; SSE2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> <i32 2, i32 poison, i32 6, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 3, i32 poison, i32 7, i32 poison>
-; SSE2-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
-; SSE2-NEXT:    ret <4 x float> [[RESULT1]]
-;
-; SSE4-LABEL: @add_v4f32_01u3(
-; SSE4-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; SSE4-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[B]]
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
-; SSE4-NEXT:    ret <4 x float> [[RESULT]]
-;
-; AVX2-LABEL: @add_v4f32_01u3(
-; AVX2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; AVX2-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[B]]
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
-; AVX2-NEXT:    ret <4 x float> [[RESULT]]
-;
-; AVX512-LABEL: @add_v4f32_01u3(
-; AVX512-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> <i32 2, i32 poison, i32 6, i32 poison>
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 3, i32 poison, i32 7, i32 poison>
-; AVX512-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; AVX512-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
-; AVX512-NEXT:    ret <4 x float> [[RESULT1]]
+; CHECK-LABEL: @add_v4f32_01u3(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; CHECK-NEXT:    [[RESULT1:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[RESULT1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
@@ -647,39 +560,27 @@ define <4 x float> @add_v4f32_01u3(<4 x float> %a, <4 x float> %b) {
 
 define <4 x float> @add_v4f32_012u(<4 x float> %a, <4 x float> %b) {
 ; SSE2-LABEL: @add_v4f32_012u(
-; SSE2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> <i32 2, i32 4, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 3, i32 5, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; SSE2-NEXT:    [[RESULT1:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
 ; SSE2-NEXT:    ret <4 x float> [[RESULT1]]
 ;
 ; SSE4-LABEL: @add_v4f32_012u(
-; SSE4-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[B]], [[SHIFT]]
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 1, i32 2, i32 4, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 3, i32 5, i32 poison>
 ; SSE4-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
-; SSE4-NEXT:    ret <4 x float> [[RESULT]]
+; SSE4-NEXT:    ret <4 x float> [[TMP4]]
 ;
 ; AVX2-LABEL: @add_v4f32_012u(
-; AVX2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[B]], [[SHIFT]]
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 1, i32 2, i32 4, i32 poison>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 3, i32 5, i32 poison>
 ; AVX2-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
-; AVX2-NEXT:    ret <4 x float> [[RESULT]]
+; AVX2-NEXT:    ret <4 x float> [[TMP4]]
 ;
 ; AVX512-LABEL: @add_v4f32_012u(
-; AVX512-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> <i32 2, i32 4, i32 poison, i32 poison>
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 3, i32 5, i32 poison, i32 poison>
-; AVX512-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; AVX512-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; AVX512-NEXT:    [[RESULT1:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
 ; AVX512-NEXT:    ret <4 x float> [[RESULT1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
@@ -804,33 +705,40 @@ define <8 x float> @add_v8f32_01234567(<8 x float> %a, <8 x float> %b) {
 }
 
 define <8 x float> @add_v8f32_012u4567(<8 x float> %a, <8 x float> %b) {
-; SSE-LABEL: @add_v8f32_012u4567(
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
-; SSE-NEXT:    [[A67:%.*]] = fadd float [[A6]], [[A7]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
-; SSE-NEXT:    [[TMP7:%.*]] = fadd <2 x float> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[HADD5:%.*]] = insertelement <8 x float> [[TMP3]], float [[A67]], i64 5
-; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
-; SSE-NEXT:    ret <8 x float> [[RESULT]]
+; SSE2-LABEL: @add_v8f32_012u4567(
+; SSE2-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
+; SSE2-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
+; SSE2-NEXT:    [[A67:%.*]] = fadd float [[A6]], [[A7]]
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
+; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP5]]
+; SSE2-NEXT:    [[HADD5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE2-NEXT:    ret <8 x float> [[RESULT]]
+;
+; SSE4-LABEL: @add_v8f32_012u4567(
+; SSE4-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
+; SSE4-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
+; SSE4-NEXT:    [[A67:%.*]] = fadd float [[A6]], [[A7]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[HADD5:%.*]] = insertelement <8 x float> [[TMP3]], float [[A67]], i64 5
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE4-NEXT:    ret <8 x float> [[RESULT]]
 ;
 ; AVX-LABEL: @add_v8f32_012u4567(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP4:%.*]] = fadd <8 x float> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[HADD5:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 14, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 13, i32 14>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 12, i32 15>
 ; AVX-NEXT:    [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
-; AVX-NEXT:    ret <8 x float> [[RESULT]]
+; AVX-NEXT:    ret <8 x float> [[TMP7]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -983,13 +891,10 @@ define <4 x double> @add_v4f64_u123(<4 x double> %a, <4 x double> %b) {
 ; SSE4-NEXT:    ret <4 x double> [[RESULT]]
 ;
 ; AVX-LABEL: @add_v4f64_u123(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[B]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 3>
 ; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> <i32 poison, i32 1, i32 2, i32 6>
-; AVX-NEXT:    ret <4 x double> [[RESULT]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -1034,13 +939,10 @@ define <4 x double> @add_v4f64_0u23(<4 x double> %a, <4 x double> %b) {
 ; SSE4-NEXT:    ret <4 x double> [[RESULT]]
 ;
 ; AVX-LABEL: @add_v4f64_0u23(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[B]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 2, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 0, i32 poison, i32 3, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 1, i32 poison, i32 2, i32 6>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 0, i32 poison, i32 3, i32 7>
 ; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 poison, i32 2, i32 6>
-; AVX-NEXT:    ret <4 x double> [[RESULT]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -1085,13 +987,10 @@ define <4 x double> @add_v4f64_01u3(<4 x double> %a, <4 x double> %b) {
 ; SSE4-NEXT:    ret <4 x double> [[RESULT]]
 ;
 ; AVX-LABEL: @add_v4f64_01u3(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[B]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 7>
 ; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 6>
-; AVX-NEXT:    ret <4 x double> [[RESULT]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -1136,13 +1035,10 @@ define <4 x double> @add_v4f64_012u(<4 x double> %a, <4 x double> %b) {
 ; SSE4-NEXT:    ret <4 x double> [[RESULT]]
 ;
 ; AVX-LABEL: @add_v4f64_012u(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 poison>
 ; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 6, i32 poison>
-; AVX-NEXT:    ret <4 x double> [[RESULT]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
new file mode 100644
index 0000000..bcb316a
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -0,0 +1,1141 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+
+; PR34072 - failure to canonicalize to (sub (shuffle a, b),(shuffle a, b)) for optimal horizontal sub patterns (with undemanded elements)
+
+;
+; v8i16
+;
+
+define <8 x i16> @sub_v8i16_01234567(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: @sub_v8i16_01234567(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+;
+  %a0 = extractelement <8 x i16> %a, i32 0
+  %a1 = extractelement <8 x i16> %a, i32 1
+  %a2 = extractelement <8 x i16> %a, i32 2
+  %a3 = extractelement <8 x i16> %a, i32 3
+  %a4 = extractelement <8 x i16> %a, i32 4
+  %a5 = extractelement <8 x i16> %a, i32 5
+  %a6 = extractelement <8 x i16> %a, i32 6
+  %a7 = extractelement <8 x i16> %a, i32 7
+  %a01 = sub i16 %a0, %a1
+  %a23 = sub i16 %a2, %a3
+  %a45 = sub i16 %a4, %a5
+  %a67 = sub i16 %a6, %a7
+  %b0 = extractelement <8 x i16> %b, i32 0
+  %b1 = extractelement <8 x i16> %b, i32 1
+  %b2 = extractelement <8 x i16> %b, i32 2
+  %b3 = extractelement <8 x i16> %b, i32 3
+  %b4 = extractelement <8 x i16> %b, i32 4
+  %b5 = extractelement <8 x i16> %b, i32 5
+  %b6 = extractelement <8 x i16> %b, i32 6
+  %b7 = extractelement <8 x i16> %b, i32 7
+  %b01 = sub i16 %b0, %b1
+  %b23 = sub i16 %b2, %b3
+  %b45 = sub i16 %b4, %b5
+  %b67 = sub i16 %b6, %b7
+  %hsub0 = insertelement <8 x i16> poison, i16 %a01, i32 0
+  %hsub1 = insertelement <8 x i16> %hsub0, i16 %a23, i32 1
+  %hsub2 = insertelement <8 x i16> %hsub1, i16 %a45, i32 2
+  %hsub3 = insertelement <8 x i16> %hsub2, i16 %a67, i32 3
+  %hsub4 = insertelement <8 x i16> %hsub3, i16 %b01, i32 4
+  %hsub5 = insertelement <8 x i16> %hsub4, i16 %b23, i32 5
+  %hsub6 = insertelement <8 x i16> %hsub5, i16 %b45, i32 6
+  %hsub7 = insertelement <8 x i16> %hsub6, i16 %b67, i32 7
+  %result = shufflevector <8 x i16> %hsub7, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %result
+}
+
+define <8 x i16> @sub_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: @sub_v8i16_u1234567(
+; SSE2-NEXT:    [[A2:%.*]] = extractelement <8 x i16> [[A:%.*]], i64 2
+; SSE2-NEXT:    [[A3:%.*]] = extractelement <8 x i16> [[A]], i64 3
+; SSE2-NEXT:    [[A4:%.*]] = extractelement <8 x i16> [[A]], i64 4
+; SSE2-NEXT:    [[A5:%.*]] = extractelement <8 x i16> [[A]], i64 5
+; SSE2-NEXT:    [[A6:%.*]] = extractelement <8 x i16> [[A]], i64 6
+; SSE2-NEXT:    [[A7:%.*]] = extractelement <8 x i16> [[A]], i64 7
+; SSE2-NEXT:    [[A23:%.*]] = sub i16 [[A2]], [[A3]]
+; SSE2-NEXT:    [[A45:%.*]] = sub i16 [[A4]], [[A5]]
+; SSE2-NEXT:    [[A67:%.*]] = sub i16 [[A6]], [[A7]]
+; SSE2-NEXT:    [[HSUB1:%.*]] = insertelement <8 x i16> poison, i16 [[A23]], i64 1
+; SSE2-NEXT:    [[HSUB2:%.*]] = insertelement <8 x i16> [[HSUB1]], i16 [[A45]], i64 2
+; SSE2-NEXT:    [[HSUB3:%.*]] = insertelement <8 x i16> [[HSUB2]], i16 [[A67]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[HSUB3]], <8 x i16> [[TMP3]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE2-NEXT:    ret <8 x i16> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v8i16_u1234567(
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; SSE4-NEXT:    [[TMP7:%.*]] = sub <8 x i16> [[TMP5]], [[TMP6]]
+; SSE4-NEXT:    ret <8 x i16> [[TMP7]]
+;
+; AVX-LABEL: @sub_v8i16_u1234567(
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; AVX-NEXT:    [[TMP7:%.*]] = sub <8 x i16> [[TMP5]], [[TMP6]]
+; AVX-NEXT:    ret <8 x i16> [[TMP7]]
+;
+  %a0 = extractelement <8 x i16> %a, i32 0
+  %a1 = extractelement <8 x i16> %a, i32 1
+  %a2 = extractelement <8 x i16> %a, i32 2
+  %a3 = extractelement <8 x i16> %a, i32 3
+  %a4 = extractelement <8 x i16> %a, i32 4
+  %a5 = extractelement <8 x i16> %a, i32 5
+  %a6 = extractelement <8 x i16> %a, i32 6
+  %a7 = extractelement <8 x i16> %a, i32 7
+  %a01 = sub i16 %a0, %a1
+  %a23 = sub i16 %a2, %a3
+  %a45 = sub i16 %a4, %a5
+  %a67 = sub i16 %a6, %a7
+  %b0 = extractelement <8 x i16> %b, i32 0
+  %b1 = extractelement <8 x i16> %b, i32 1
+  %b2 = extractelement <8 x i16> %b, i32 2
+  %b3 = extractelement <8 x i16> %b, i32 3
+  %b4 = extractelement <8 x i16> %b, i32 4
+  %b5 = extractelement <8 x i16> %b, i32 5
+  %b6 = extractelement <8 x i16> %b, i32 6
+  %b7 = extractelement <8 x i16> %b, i32 7
+  %b01 = sub i16 %b0, %b1
+  %b23 = sub i16 %b2, %b3
+  %b45 = sub i16 %b4, %b5
+  %b67 = sub i16 %b6, %b7
+  %hsub0 = insertelement <8 x i16> poison, i16 %a01, i32 0
+  %hsub1 = insertelement <8 x i16> %hsub0, i16 %a23, i32 1
+  %hsub2 = insertelement <8 x i16> %hsub1, i16 %a45, i32 2
+  %hsub3 = insertelement <8 x i16> %hsub2, i16 %a67, i32 3
+  %hsub4 = insertelement <8 x i16> %hsub3, i16 %b01, i32 4
+  %hsub5 = insertelement <8 x i16> %hsub4, i16 %b23, i32 5
+  %hsub6 = insertelement <8 x i16> %hsub5, i16 %b45, i32 6
+  %hsub7 = insertelement <8 x i16> %hsub6, i16 %b67, i32 7
+  %result = shufflevector <8 x i16> %hsub7, <8 x i16> %a, <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %result
+}
+
+;
+; v4i32
+;
+
+define <4 x i32> @sub_v4i32_0123(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_0123(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_u123(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_u123(
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_0u23(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_0u23(
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_01u3(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: @sub_v4i32_01u3(
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; SSE2-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; SSE2-NEXT:    ret <4 x i32> [[TMP4]]
+;
+; SSE4-LABEL: @sub_v4i32_01u3(
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; SSE4-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; SSE4-NEXT:    ret <4 x i32> [[TMP4]]
+;
+; AVX2-LABEL: @sub_v4i32_01u3(
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; AVX2-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; AVX2-NEXT:    ret <4 x i32> [[TMP4]]
+;
+; AVX512-LABEL: @sub_v4i32_01u3(
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; AVX512-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; AVX512-NEXT:    ret <4 x i32> [[TMP4]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_012u(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_012u(
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_uu23(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_uu23(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
+; CHECK-NEXT:    [[RESULT1:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_01uu(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_01uu(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+  ret <4 x i32> %result
+}
+
+;
+; v8i32
+;
+
+define <8 x i32> @sub_v8i32_01234567(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: @sub_v8i32_01234567(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %a45 = sub i32 %a4, %a5
+  %a67 = sub i32 %a6, %a7
+  %b0 = extractelement <8 x i32> %b, i32 0
+  %b1 = extractelement <8 x i32> %b, i32 1
+  %b2 = extractelement <8 x i32> %b, i32 2
+  %b3 = extractelement <8 x i32> %b, i32 3
+  %b4 = extractelement <8 x i32> %b, i32 4
+  %b5 = extractelement <8 x i32> %b, i32 5
+  %b6 = extractelement <8 x i32> %b, i32 6
+  %b7 = extractelement <8 x i32> %b, i32 7
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %b45 = sub i32 %b4, %b5
+  %b67 = sub i32 %b6, %b7
+  %hsub0 = insertelement <8 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <8 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <8 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <8 x i32> %hsub2, i32 %b23, i32 3
+  %hsub4 = insertelement <8 x i32> %hsub3, i32 %a45, i32 4
+  %hsub5 = insertelement <8 x i32> %hsub4, i32 %a67, i32 5
+  %hsub6 = insertelement <8 x i32> %hsub5, i32 %b45, i32 6
+  %hsub7 = insertelement <8 x i32> %hsub6, i32 %b67, i32 7
+  %result = shufflevector <8 x i32> %hsub7, <8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %result
+}
+
+define <8 x i32> @sub_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) {
+; SSE2-LABEL: @sub_v8i32_01234u67(
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 12, i32 14>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 13, i32 15>
+; SSE2-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[TMP3]]
+; SSE2-NEXT:    ret <8 x i32> [[TMP4]]
+;
+; SSE4-LABEL: @sub_v8i32_01234u67(
+; SSE4-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
+; SSE4-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
+; SSE4-NEXT:    [[A45:%.*]] = sub i32 [[A4]], [[A5]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[HSUB4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A45]], i64 4
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP6:%.*]] = sub <8 x i32> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HSUB4]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
+; SSE4-NEXT:    ret <8 x i32> [[RESULT]]
+;
+; AVX-LABEL: @sub_v8i32_01234u67(
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 12, i32 14>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 13, i32 15>
+; AVX-NEXT:    [[TMP7:%.*]] = sub <8 x i32> [[TMP5]], [[TMP6]]
+; AVX-NEXT:    ret <8 x i32> [[TMP7]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %a45 = sub i32 %a4, %a5
+  %a67 = sub i32 %a6, %a7
+  %b0 = extractelement <8 x i32> %b, i32 0
+  %b1 = extractelement <8 x i32> %b, i32 1
+  %b2 = extractelement <8 x i32> %b, i32 2
+  %b3 = extractelement <8 x i32> %b, i32 3
+  %b4 = extractelement <8 x i32> %b, i32 4
+  %b5 = extractelement <8 x i32> %b, i32 5
+  %b6 = extractelement <8 x i32> %b, i32 6
+  %b7 = extractelement <8 x i32> %b, i32 7
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %b45 = sub i32 %b4, %b5
+  %b67 = sub i32 %b6, %b7
+  %hsub0 = insertelement <8 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <8 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <8 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <8 x i32> %hsub2, i32 %b23, i32 3
+  %hsub4 = insertelement <8 x i32> %hsub3, i32 %a45, i32 4
+  %hsub5 = insertelement <8 x i32> %hsub4, i32 %a67, i32 5
+  %hsub6 = insertelement <8 x i32> %hsub5, i32 %b45, i32 6
+  %hsub7 = insertelement <8 x i32> %hsub6, i32 %b67, i32 7
+  %result = shufflevector <8 x i32> %hsub7, <8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 6, i32 7>
+  ret <8 x i32> %result
+}
+
+;
+; v4f32
+;
+
+define <4 x float> @sub_v4f32_0123(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_0123(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_u123(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_u123(
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>
+  ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_0u23(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_0u23(
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
+  ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_01u3(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_01u3(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; CHECK-NEXT:    [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[RESULT1]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
+  ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_012u(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: @sub_v4f32_012u(
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; SSE2-NEXT:    [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    ret <4 x float> [[RESULT1]]
+;
+; SSE4-LABEL: @sub_v4f32_012u(
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; SSE4-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; SSE4-NEXT:    ret <4 x float> [[TMP4]]
+;
+; AVX2-LABEL: @sub_v4f32_012u(
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; AVX2-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; AVX2-NEXT:    ret <4 x float> [[TMP4]]
+;
+; AVX512-LABEL: @sub_v4f32_012u(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; AVX512-NEXT:    [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    ret <4 x float> [[RESULT1]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+  ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_uu23(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_uu23(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
+; CHECK-NEXT:    [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[RESULT1]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3>
+  ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_01uu(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_01uu(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+  ret <4 x float> %result
+}
+
+;
+; v8f32
+;
+
+define <8 x float> @sub_v8f32_01234567(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: @sub_v8f32_01234567(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+;
+  %a0 = extractelement <8 x float> %a, i32 0
+  %a1 = extractelement <8 x float> %a, i32 1
+  %a2 = extractelement <8 x float> %a, i32 2
+  %a3 = extractelement <8 x float> %a, i32 3
+  %a4 = extractelement <8 x float> %a, i32 4
+  %a5 = extractelement <8 x float> %a, i32 5
+  %a6 = extractelement <8 x float> %a, i32 6
+  %a7 = extractelement <8 x float> %a, i32 7
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %a45 = fsub float %a4, %a5
+  %a67 = fsub float %a6, %a7
+  %b0 = extractelement <8 x float> %b, i32 0
+  %b1 = extractelement <8 x float> %b, i32 1
+  %b2 = extractelement <8 x float> %b, i32 2
+  %b3 = extractelement <8 x float> %b, i32 3
+  %b4 = extractelement <8 x float> %b, i32 4
+  %b5 = extractelement <8 x float> %b, i32 5
+  %b6 = extractelement <8 x float> %b, i32 6
+  %b7 = extractelement <8 x float> %b, i32 7
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %b45 = fsub float %b4, %b5
+  %b67 = fsub float %b6, %b7
+  %hsub0 = insertelement <8 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <8 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <8 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <8 x float> %hsub2, float %b23, i32 3
+  %hsub4 = insertelement <8 x float> %hsub3, float %a45, i32 4
+  %hsub5 = insertelement <8 x float> %hsub4, float %a67, i32 5
+  %hsub6 = insertelement <8 x float> %hsub5, float %b45, i32 6
+  %hsub7 = insertelement <8 x float> %hsub6, float %b67, i32 7
+  %result = shufflevector <8 x float> %hsub7, <8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %result
+}
+
+define <8 x float> @sub_v8f32_012u4567(<8 x float> %a, <8 x float> %b) {
+; SSE2-LABEL: @sub_v8f32_012u4567(
+; SSE2-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
+; SSE2-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
+; SSE2-NEXT:    [[A67:%.*]] = fsub float [[A6]], [[A7]]
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP5]]
+; SSE2-NEXT:    [[HSUB5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE2-NEXT:    ret <8 x float> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v8f32_012u4567(
+; SSE4-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
+; SSE4-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
+; SSE4-NEXT:    [[A67:%.*]] = fsub float [[A6]], [[A7]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[HSUB5:%.*]] = insertelement <8 x float> [[TMP3]], float [[A67]], i64 5
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE4-NEXT:    ret <8 x float> [[RESULT]]
+;
+; AVX-LABEL: @sub_v8f32_012u4567(
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 12, i32 14>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 13, i32 15>
+; AVX-NEXT:    [[TMP7:%.*]] = fsub <8 x float> [[TMP5]], [[TMP6]]
+; AVX-NEXT:    ret <8 x float> [[TMP7]]
+;
+  %a0 = extractelement <8 x float> %a, i32 0
+  %a1 = extractelement <8 x float> %a, i32 1
+  %a2 = extractelement <8 x float> %a, i32 2
+  %a3 = extractelement <8 x float> %a, i32 3
+  %a4 = extractelement <8 x float> %a, i32 4
+  %a5 = extractelement <8 x float> %a, i32 5
+  %a6 = extractelement <8 x float> %a, i32 6
+  %a7 = extractelement <8 x float> %a, i32 7
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %a45 = fsub float %a4, %a5
+  %a67 = fsub float %a6, %a7
+  %b0 = extractelement <8 x float> %b, i32 0
+  %b1 = extractelement <8 x float> %b, i32 1
+  %b2 = extractelement <8 x float> %b, i32 2
+  %b3 = extractelement <8 x float> %b, i32 3
+  %b4 = extractelement <8 x float> %b, i32 4
+  %b5 = extractelement <8 x float> %b, i32 5
+  %b6 = extractelement <8 x float> %b, i32 6
+  %b7 = extractelement <8 x float> %b, i32 7
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %b45 = fsub float %b4, %b5
+  %b67 = fsub float %b6, %b7
+  %hsub0 = insertelement <8 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <8 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <8 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <8 x float> %hsub2, float %b23, i32 3
+  %hsub4 = insertelement <8 x float> %hsub3, float %a45, i32 4
+  %hsub5 = insertelement <8 x float> %hsub4, float %a67, i32 5
+  %hsub6 = insertelement <8 x float> %hsub5, float %b45, i32 6
+  %hsub7 = insertelement <8 x float> %hsub6, float %b67, i32 7
+  %result = shufflevector <8 x float> %hsub7, <8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %result
+}
+
+;
+; v2f64
+;
+
+define <2 x double> @sub_v2f64_01(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @sub_v2f64_01(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[TMP3]]
+;
+  %a0 = extractelement <2 x double> %a, i32 0
+  %a1 = extractelement <2 x double> %a, i32 1
+  %a01 = fsub double %a0, %a1
+  %b0 = extractelement <2 x double> %b, i32 0
+  %b1 = extractelement <2 x double> %b, i32 1
+  %b01 = fsub double %b0, %b1
+  %hsub0 = insertelement <2 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <2 x double> %hsub0, double %b01, i32 1
+  %result = shufflevector <2 x double> %hsub1, <2 x double> %a, <2 x i32> <i32 0, i32 1>
+  ret <2 x double> %result
+}
+
+define <2 x double> @sub_v2f64_u1(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @sub_v2f64_u1(
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> [[B]], [[SHIFT]]
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    ret <2 x double> [[RESULT]]
+;
+  %a0 = extractelement <2 x double> %a, i32 0
+  %a1 = extractelement <2 x double> %a, i32 1
+  %a01 = fsub double %a0, %a1
+  %b0 = extractelement <2 x double> %b, i32 0
+  %b1 = extractelement <2 x double> %b, i32 1
+  %b01 = fsub double %b0, %b1
+  %hsub0 = insertelement <2 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <2 x double> %hsub0, double %b01, i32 1
+  %result = shufflevector <2 x double> %hsub1, <2 x double> %a, <2 x i32> <i32 poison, i32 1>
+  ret <2 x double> %result
+}
+
+define <2 x double> @sub_v2f64_0u(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @sub_v2f64_0u(
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    ret <2 x double> [[RESULT]]
+;
+  %a0 = extractelement <2 x double> %a, i32 0
+  %a1 = extractelement <2 x double> %a, i32 1
+  %a01 = fsub double %a0, %a1
+  %b0 = extractelement <2 x double> %b, i32 0
+  %b1 = extractelement <2 x double> %b, i32 1
+  %b01 = fsub double %b0, %b1
+  %hsub0 = insertelement <2 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <2 x double> %hsub0, double %b01, i32 1
+  %result = shufflevector <2 x double> %hsub1, <2 x double> %a, <2 x i32> <i32 0, i32 poison>
+  ret <2 x double> %result
+}
+
+;
+; v4f64
+;
+
+define <4 x double> @sub_v4f64_0123(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: @sub_v4f64_0123(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[TMP3]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_u123(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: @sub_v4f64_u123(
+; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <2 x i32> <i32 0, i32 6>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <2 x i32> <i32 1, i32 7>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 poison>
+; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
+; SSE2-NEXT:    ret <4 x double> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v4f64_u123(
+; SSE4-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE4-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE4-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
+; SSE4-NEXT:    ret <4 x double> [[RESULT]]
+;
+; AVX-LABEL: @sub_v4f64_u123(
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 3>
+; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>
+  ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_0u23(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: @sub_v4f64_0u23(
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE2-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
+; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
+; SSE2-NEXT:    ret <4 x double> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v4f64_0u23(
+; SSE4-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE4-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE4-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> <i32 0, i32 poison, i32 2, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 3, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
+; SSE4-NEXT:    ret <4 x double> [[RESULT]]
+;
+; AVX-LABEL: @sub_v4f64_0u23(
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 2, i32 6>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 3, i32 7>
+; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
+  ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_01u3(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: @sub_v4f64_01u3(
+; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
+; SSE2-NEXT:    ret <4 x double> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v4f64_01u3(
+; SSE4-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE4-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE4-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
+; SSE4-NEXT:    ret <4 x double> [[RESULT]]
+;
+; AVX-LABEL: @sub_v4f64_01u3(
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 7>
+; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
+  ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_012u(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: @sub_v4f64_012u(
+; SSE2-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
+; SSE2-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
+; SSE2-NEXT:    [[A23:%.*]] = fsub double [[A2]], [[A3]]
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[A23]], i64 2
+; SSE2-NEXT:    ret <4 x double> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v4f64_012u(
+; SSE4-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
+; SSE4-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
+; SSE4-NEXT:    [[A23:%.*]] = fsub double [[A2]], [[A3]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[A23]], i64 2
+; SSE4-NEXT:    ret <4 x double> [[RESULT]]
+;
+; AVX-LABEL: @sub_v4f64_012u(
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 poison>
+; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+  ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_uu23(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: @sub_v4f64_uu23(
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 2, i32 6>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; SSE2-NEXT:    ret <4 x double> [[RESULT1]]
+;
+; SSE4-LABEL: @sub_v4f64_uu23(
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
+; SSE4-NEXT:    [[RESULT1:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    ret <4 x double> [[RESULT1]]
+;
+; AVX-LABEL: @sub_v4f64_uu23(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
+; AVX-NEXT:    [[RESULT1:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <4 x double> [[RESULT1]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3>
+  ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_01uu(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: @sub_v4f64_01uu(
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE2-NEXT:    ret <4 x double> [[TMP4]]
+;
+; SSE4-LABEL: @sub_v4f64_01uu(
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    ret <4 x double> [[TMP3]]
+;
+; AVX-LABEL: @sub_v4f64_01uu(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <4 x double> [[TMP3]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+  ret <4 x double> %result
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
index 4e1051d..d92df9741 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
@@ -31,13 +31,10 @@ define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) {
 ; SSE4-NEXT:    ret <4 x double> [[SHUFFLE]]
 ;
 ; AVX-LABEL: @PR50392(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 poison>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; AVX-NEXT:    [[B:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B1:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B1]], <4 x i32> <i32 1, i32 poison, i32 5, i32 7>
 ; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[B]], [[SHIFT]]
-; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 2, i32 6>
-; AVX-NEXT:    ret <4 x double> [[SHUFFLE]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
 ;
   %vecext = extractelement <4 x double> %a, i32 0
   %vecext1 = extractelement <4 x double> %a, i32 1
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll b/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll
index fe5bba1a..7bb22e2 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/PhaseOrdering/bitfield-bittests.ll b/llvm/test/Transforms/PhaseOrdering/bitfield-bittests.ll
index 2843a7e..126be02 100644
--- a/llvm/test/Transforms/PhaseOrdering/bitfield-bittests.ll
+++ b/llvm/test/Transforms/PhaseOrdering/bitfield-bittests.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -O3 -S < %s                    | FileCheck %s
-; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s
+; RUN: opt -passes="default<O3>" -S < %s  | FileCheck %s
 
 ; These are tests that check for set/clear bits in a bitfield based on PR37098:
 ; https://bugs.llvm.org/show_bug.cgi?id=37098
diff --git a/llvm/test/Transforms/PhaseOrdering/dae-dce.ll b/llvm/test/Transforms/PhaseOrdering/dae-dce.ll
index 7ff3c5d..7cdddd1 100644
--- a/llvm/test/Transforms/PhaseOrdering/dae-dce.ll
+++ b/llvm/test/Transforms/PhaseOrdering/dae-dce.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes='default<O3>' < %s | FileCheck %s --check-prefixes=CHECK,DEFAULT
+; RUN: opt -S -passes="default<O3>" < %s | FileCheck %s --check-prefixes=CHECK,DEFAULT
 ; RUN: opt -S -passes='lto<O3>' < %s | FileCheck %s --check-prefixes=CHECK,LTO
 
 declare void @llvm.trap()
diff --git a/llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll b/llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll
index 689f4a9..641f216 100644
--- a/llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll
+++ b/llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s --check-prefixes=ALL,O3
+; RUN: opt -passes="default<O3>" -S < %s  | FileCheck %s --check-prefixes=ALL,O3
 ; RUN: opt -passes='default<O2>' -S < %s  | FileCheck %s --check-prefixes=ALL,O2
 ; RUN: opt -passes='default<O1>' -S < %s  | FileCheck %s --check-prefixes=ALL,O1
 
diff --git a/llvm/test/Transforms/PhaseOrdering/globalaa-retained.ll b/llvm/test/Transforms/PhaseOrdering/globalaa-retained.ll
index 2139542..82a453d 100644
--- a/llvm/test/Transforms/PhaseOrdering/globalaa-retained.ll
+++ b/llvm/test/Transforms/PhaseOrdering/globalaa-retained.ll
@@ -1,4 +1,4 @@
-; RUN: opt -passes='default<O3>' -S < %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64"
 
diff --git a/llvm/test/Transforms/PhaseOrdering/instcombine-sroa-inttoptr.ll b/llvm/test/Transforms/PhaseOrdering/instcombine-sroa-inttoptr.ll
index ba6c36a..cc20233a 100644
--- a/llvm/test/Transforms/PhaseOrdering/instcombine-sroa-inttoptr.ll
+++ b/llvm/test/Transforms/PhaseOrdering/instcombine-sroa-inttoptr.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -O3 -S                                        | FileCheck %s
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s
 
 ; This is based on the following most basic C++ code:
 ;
diff --git a/llvm/test/Transforms/PhaseOrdering/lifetime-sanitizer.ll b/llvm/test/Transforms/PhaseOrdering/lifetime-sanitizer.ll
index 21fa234..1239b18 100644
--- a/llvm/test/Transforms/PhaseOrdering/lifetime-sanitizer.ll
+++ b/llvm/test/Transforms/PhaseOrdering/lifetime-sanitizer.ll
@@ -5,7 +5,7 @@
 ; RUN: opt < %s -passes='default<O0>' -S | FileCheck %s --check-prefixes=CHECK,NOOPT
 ; RUN: opt < %s -passes='default<O1>' -S | FileCheck %s --check-prefixes=CHECK,OPT
 ; RUN: opt < %s -passes='default<O2>' -S | FileCheck %s --check-prefixes=CHECK,OPT
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s --check-prefixes=CHECK,OPT
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s --check-prefixes=CHECK,OPT
 
 declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
 declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
diff --git a/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll b/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll
index bae3e26..c6b5e5f 100644
--- a/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll
+++ b/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default<O3>' -rotation-max-header-size=0 -S < %s  | FileCheck %s --check-prefix=HOIST
-; RUN: opt -passes='default<O3>' -rotation-max-header-size=1 -S < %s  | FileCheck %s --check-prefix=HOIST
-; RUN: opt -passes='default<O3>' -rotation-max-header-size=2 -S < %s  | FileCheck %s --check-prefix=ROTATE
-; RUN: opt -passes='default<O3>' -rotation-max-header-size=3 -S < %s  | FileCheck %s --check-prefix=ROTATE
+; RUN: opt -passes="default<O3>" -rotation-max-header-size=0 -S < %s  | FileCheck %s --check-prefix=HOIST
+; RUN: opt -passes="default<O3>" -rotation-max-header-size=1 -S < %s  | FileCheck %s --check-prefix=HOIST
+; RUN: opt -passes="default<O3>" -rotation-max-header-size=2 -S < %s  | FileCheck %s --check-prefix=ROTATE
+; RUN: opt -passes="default<O3>" -rotation-max-header-size=3 -S < %s  | FileCheck %s --check-prefix=ROTATE
 
 ; This example is produced from a very basic C code:
 ;
diff --git a/llvm/test/Transforms/PhaseOrdering/pr32544.ll b/llvm/test/Transforms/PhaseOrdering/pr32544.ll
index 421260b..135084d 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr32544.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr32544.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -O3 -S < %s                    | FileCheck %s
-; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s
+; RUN: opt -passes="default<O3>" -S < %s  | FileCheck %s
 
 define void @foo(i1 %which, i32 %a, i32 %b, ptr %result) {
 ; CHECK-LABEL: @foo(
diff --git a/llvm/test/Transforms/PhaseOrdering/pr45682.ll b/llvm/test/Transforms/PhaseOrdering/pr45682.ll
index 2230580..46ee191 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr45682.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr45682.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -O3 -S < %s                    | FileCheck %s
-; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s
+; RUN: opt -passes="default<O3>" -S < %s  | FileCheck %s
 
 define void @PR45682(i32 %x, i32 %y) {
 ; CHECK-LABEL: @PR45682(
diff --git a/llvm/test/Transforms/PhaseOrdering/pr62311.ll b/llvm/test/Transforms/PhaseOrdering/pr62311.ll
index 03276d8..027df7d 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr62311.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr62311.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s
 
 ; C++ version of test case
 ; #include <x86intrin.h>
diff --git a/llvm/test/Transforms/PhaseOrdering/pr95152.ll b/llvm/test/Transforms/PhaseOrdering/pr95152.ll
index 016460f..6941ea2 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr95152.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr95152.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes='default<O3>' < %s | FileCheck %s
+; RUN: opt -S -passes="default<O3>" < %s | FileCheck %s
 
 ; Make sure that interaction of "writable" with various passes does not
 ; result in the elimination of the store prior to @j().
diff --git a/llvm/test/Transforms/PhaseOrdering/rotate.ll b/llvm/test/Transforms/PhaseOrdering/rotate.ll
index 9ce1969..9179edc 100644
--- a/llvm/test/Transforms/PhaseOrdering/rotate.ll
+++ b/llvm/test/Transforms/PhaseOrdering/rotate.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -O3 -S < %s                    | FileCheck %s
-; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s
+; RUN: opt -passes="default<O3>" -S < %s  | FileCheck %s
 
 ; This should become a single funnel shift through a combination
 ; of aggressive-instcombine, simplifycfg, and instcombine.
diff --git a/llvm/test/Transforms/PhaseOrdering/simplifycfg-switch-lowering-vs-correlatedpropagation.ll b/llvm/test/Transforms/PhaseOrdering/simplifycfg-switch-lowering-vs-correlatedpropagation.ll
index 03df138..9da46bd 100644
--- a/llvm/test/Transforms/PhaseOrdering/simplifycfg-switch-lowering-vs-correlatedpropagation.ll
+++ b/llvm/test/Transforms/PhaseOrdering/simplifycfg-switch-lowering-vs-correlatedpropagation.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes='default<O1>' -S < %s | FileCheck %s
 ; RUN: opt -passes='default<O2>' -S < %s | FileCheck %s
-; RUN: opt -passes='default<O3>' -S < %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S < %s | FileCheck %s
 
 ; We are worse at propagating correlation facts when in select form
 ; as compared to the PHI form, so if we lower switches to early,
diff --git a/llvm/test/Transforms/PhaseOrdering/switch-sext.ll b/llvm/test/Transforms/PhaseOrdering/switch-sext.ll
index 0e352ba..3fbb02d 100644
--- a/llvm/test/Transforms/PhaseOrdering/switch-sext.ll
+++ b/llvm/test/Transforms/PhaseOrdering/switch-sext.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -S -passes='default<O3>' < %s | FileCheck %s
+; RUN: opt -S -passes="default<O3>" < %s | FileCheck %s
 
 define i8 @test_switch_with_sext_phi(i8 %code) {
 ; CHECK-LABEL: define noundef i8 @test_switch_with_sext_phi(
diff --git a/llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll b/llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll
index d2f33f9..33266ca 100644
--- a/llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll
+++ b/llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -S -passes='default<O1>' < %s | FileCheck %s
 ; RUN: opt -S -passes='default<O2>' < %s | FileCheck %s
-; RUN: opt -S -passes='default<O3>' < %s | FileCheck %s
+; RUN: opt -S -passes="default<O3>" < %s | FileCheck %s
 
 target datalayout = "n64"
 
diff --git a/llvm/test/Transforms/SCCP/range-and-or-bit-masked.ll b/llvm/test/Transforms/SCCP/range-and-or-bit-masked.ll
new file mode 100644
index 0000000..e81c5d7
--- /dev/null
+++ b/llvm/test/Transforms/SCCP/range-and-or-bit-masked.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=ipsccp %s | FileCheck %s
+
+declare void @use(i1)
+
+define i1 @test1(i64 %x) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp ugt i64 [[X:%.*]], 65535
+; CHECK-NEXT:    call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT:    [[MASK:%.*]] = and i64 [[X]], -65521
+; CHECK-NEXT:    ret i1 false
+;
+entry:
+  %cond = icmp ugt i64 %x, 65535
+  call void @llvm.assume(i1 %cond)
+  %mask = and i64 %x, -65521
+  %cmp = icmp eq i64 %mask, 0
+  ret i1 %cmp
+}
+
+define void @test.and(i64 %x, i64 %y) {
+; CHECK-LABEL: @test.and(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C0:%.*]] = icmp uge i64 [[X:%.*]], 138
+; CHECK-NEXT:    [[C1:%.*]] = icmp ule i64 [[X]], 161
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C0]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C1]])
+; CHECK-NEXT:    [[C2:%.*]] = icmp uge i64 [[Y:%.*]], 186
+; CHECK-NEXT:    [[C3:%.*]] = icmp ule i64 [[Y]], 188
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C2]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C3]])
+; CHECK-NEXT:    [[AND:%.*]] = and i64 [[X]], [[Y]]
+; CHECK-NEXT:    call void @use(i1 false)
+; CHECK-NEXT:    [[R1:%.*]] = icmp ult i64 [[AND]], 137
+; CHECK-NEXT:    call void @use(i1 [[R1]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c0 = icmp uge i64 %x, 138 ; 0b10001010
+  %c1 = icmp ule i64 %x, 161 ; 0b10100000
+  call void @llvm.assume(i1 %c0)
+  call void @llvm.assume(i1 %c1)
+  %c2 = icmp uge i64 %y, 186 ; 0b10111010
+  %c3 = icmp ule i64 %y, 188 ; 0b10111110
+  call void @llvm.assume(i1 %c2)
+  call void @llvm.assume(i1 %c3)
+  %and = and i64 %x, %y
+  %r0 = icmp ult i64 %and, 136 ; 0b10001000
+  call void @use(i1 %r0) ; false
+  %r1 = icmp ult i64 %and, 137
+  call void @use(i1 %r1) ; unknown
+  ret void
+}
+
+define void @test.or(i64 %x, i64 %y) {
+; CHECK-LABEL: @test.or(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C0:%.*]] = icmp ule i64 [[X:%.*]], 117
+; CHECK-NEXT:    [[C1:%.*]] = icmp uge i64 [[X]], 95
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C0]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C1]])
+; CHECK-NEXT:    [[C2:%.*]] = icmp ule i64 [[Y:%.*]], 69
+; CHECK-NEXT:    [[C3:%.*]] = icmp uge i64 [[Y]], 67
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C2]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C3]])
+; CHECK-NEXT:    [[OR:%.*]] = or i64 [[X]], [[Y]]
+; CHECK-NEXT:    call void @use(i1 false)
+; CHECK-NEXT:    [[R1:%.*]] = icmp ugt i64 [[OR]], 118
+; CHECK-NEXT:    call void @use(i1 [[R1]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c0 = icmp ule i64 %x, 117 ; 0b01110101
+  %c1 = icmp uge i64 %x, 95  ; 0b01011111
+  call void @llvm.assume(i1 %c0)
+  call void @llvm.assume(i1 %c1)
+  %c2 = icmp ule i64 %y, 69  ; 0b01000101
+  %c3 = icmp uge i64 %y, 67  ; 0b01000011
+  call void @llvm.assume(i1 %c2)
+  call void @llvm.assume(i1 %c3)
+  %or = or i64 %x, %y
+  %r0 = icmp ugt i64 %or, 119 ; 0b01110111
+  call void @use(i1 %r0) ; false
+  %r1 = icmp ugt i64 %or, 118
+  call void @use(i1 %r1) ; unknown
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
index 6dceabe..00a4417 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
@@ -80,16 +80,11 @@ define half @reduce_fast_half8(<8 x half> %vec8) {
 ; NOFP16-LABEL: define half @reduce_fast_half8(
 ; NOFP16-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] {
 ; NOFP16-NEXT:  [[ENTRY:.*:]]
-; NOFP16-NEXT:    [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4
-; NOFP16-NEXT:    [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5
-; NOFP16-NEXT:    [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6
-; NOFP16-NEXT:    [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7
 ; NOFP16-NEXT:    [[TMP0:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; NOFP16-NEXT:    [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP0]])
-; NOFP16-NEXT:    [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[ELT4]]
-; NOFP16-NEXT:    [[OP_RDX1:%.*]] = fadd fast half [[ELT5]], [[ELT6]]
-; NOFP16-NEXT:    [[OP_RDX2:%.*]] = fadd fast half [[OP_RDX]], [[OP_RDX1]]
-; NOFP16-NEXT:    [[OP_RDX3:%.*]] = fadd fast half [[OP_RDX2]], [[ELT7]]
+; NOFP16-NEXT:    [[TMP2:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; NOFP16-NEXT:    [[TMP3:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP2]])
+; NOFP16-NEXT:    [[OP_RDX3:%.*]] = fadd fast half [[TMP1]], [[TMP3]]
 ; NOFP16-NEXT:    ret half [[OP_RDX3]]
 ;
 ; FULLFP16-LABEL: define half @reduce_fast_half8(
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
index 289807a..3cab4a4 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
@@ -48,12 +48,12 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP15:%.*]] = phi <8 x float> [ [[TMP17]], %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ]
 ; CHECK-NEXT:    [[TMP16:%.*]] = phi <2 x float> [ [[TMP31]], %[[BB77]] ], [ [[TMP37:%.*]], %[[BB78]] ]
 ; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 1, i32 3, i32 5, i32 3, i32 1, i32 0, i32 4, i32 5, i32 5>
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 2, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 2, i32 5, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 2, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP20]], <16 x i32> <i32 0, i32 17, i32 2, i32 16, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <16 x float> [[TMP21]], <16 x float> [[TMP20]], <16 x i32> <i32 0, i32 17, i32 2, i32 16, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <16 x float> [[TMP21]], <16 x float> [[TMP22]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 18, i32 6, i32 7, i32 8, i32 20, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x float> [[TMP23]], <16 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 5, i32 3, i32 1, i32 3, i32 9, i32 3, i32 1, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP40:%.*]] = shufflevector <16 x float> [[TMP23]], <16 x float> [[TMP22]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 18, i32 6, i32 7, i32 8, i32 20, i32 10, i32 11, i32 12, i32 21, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x float> [[TMP40]], <16 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 5, i32 3, i32 1, i32 3, i32 9, i32 3, i32 1, i32 5, i32 13, i32 9, i32 9>
 ; CHECK-NEXT:    [[TMP18:%.*]] = fmul fast <16 x float> [[TMP24]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = fmul fast <16 x float> [[TMP38]], [[TMP25]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = fadd fast <16 x float> [[TMP26]], [[TMP18]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
index 8093285..a504f3e 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
@@ -8,56 +8,34 @@
 define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, ptr %arg5, i1 %arg6, i1 %arg7, i1 %arg8) {
 ; CHECK-LABEL: @zot(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[VAL:%.*]] = fmul fast float 0.000000e+00, 0.000000e+00
 ; CHECK-NEXT:    [[VAL9:%.*]] = fmul fast float 0.000000e+00, [[ARG:%.*]]
-; CHECK-NEXT:    [[VAL10:%.*]] = fmul fast float [[ARG3:%.*]], 1.000000e+00
-; CHECK-NEXT:    [[VAL11:%.*]] = fmul fast float [[ARG3]], 1.000000e+00
-; CHECK-NEXT:    [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00
-; CHECK-NEXT:    [[VAL13:%.*]] = fadd fast float [[VAL12]], 2.000000e+00
-; CHECK-NEXT:    [[VAL14:%.*]] = fadd fast float 0.000000e+00, 0.000000e+00
-; CHECK-NEXT:    [[VAL15:%.*]] = fadd fast float [[VAL14]], 1.000000e+00
-; CHECK-NEXT:    [[VAL16:%.*]] = fadd fast float [[ARG3]], 1.000000e+00
-; CHECK-NEXT:    [[VAL17:%.*]] = fadd fast float [[ARG3]], 1.000000e+00
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float 0.000000e+00, float poison, float poison, float poison>, float [[ARG]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[ARG3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x float> [[TMP4]], <float 1.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP2]], <2 x float> [[TMP5]], i64 0)
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <4 x float> [[TMP6]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
 ; CHECK-NEXT:    br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]]
 ; CHECK:       bb18:
-; CHECK-NEXT:    [[VAL19:%.*]] = phi float [ [[VAL13]], [[BB:%.*]] ]
-; CHECK-NEXT:    [[VAL20:%.*]] = phi float [ [[VAL15]], [[BB]] ]
-; CHECK-NEXT:    [[VAL21:%.*]] = phi float [ [[VAL16]], [[BB]] ]
-; CHECK-NEXT:    [[VAL22:%.*]] = phi float [ [[VAL17]], [[BB]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = phi <4 x float> [ [[TMP7]], [[BB:%.*]] ]
+; CHECK-NEXT:    [[VAL16:%.*]] = extractelement <4 x float> [[TMP7]], i32 2
 ; CHECK-NEXT:    [[VAL23:%.*]] = fmul fast float [[VAL16]], 2.000000e+00
+; CHECK-NEXT:    [[VAL17:%.*]] = extractelement <4 x float> [[TMP7]], i32 3
 ; CHECK-NEXT:    [[VAL24:%.*]] = fmul fast float [[VAL17]], 3.000000e+00
 ; CHECK-NEXT:    br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]]
 ; CHECK:       bb25:
-; CHECK-NEXT:    [[VAL26:%.*]] = phi float [ [[VAL19]], [[BB18]] ]
-; CHECK-NEXT:    [[VAL27:%.*]] = phi float [ [[VAL20]], [[BB18]] ]
-; CHECK-NEXT:    [[VAL28:%.*]] = phi float [ [[VAL21]], [[BB18]] ]
-; CHECK-NEXT:    [[VAL29:%.*]] = phi float [ [[VAL22]], [[BB18]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = phi <4 x float> [ [[TMP8]], [[BB18]] ]
 ; CHECK-NEXT:    br label [[BB30:%.*]]
 ; CHECK:       bb30:
 ; CHECK-NEXT:    [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
 ; CHECK-NEXT:    [[VAL32:%.*]] = phi float [ [[VAL9]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
-; CHECK-NEXT:    [[VAL33:%.*]] = load i8, ptr [[ARG5:%.*]], align 1
-; CHECK-NEXT:    [[VAL34:%.*]] = uitofp i8 [[VAL33]] to float
-; CHECK-NEXT:    [[VAL35:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 1
-; CHECK-NEXT:    [[VAL36:%.*]] = load i8, ptr [[VAL35]], align 1
-; CHECK-NEXT:    [[VAL37:%.*]] = uitofp i8 [[VAL36]] to float
-; CHECK-NEXT:    [[VAL38:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 2
-; CHECK-NEXT:    [[VAL39:%.*]] = load i8, ptr [[VAL38]], align 1
-; CHECK-NEXT:    [[VAL40:%.*]] = uitofp i8 [[VAL39]] to float
-; CHECK-NEXT:    [[VAL41:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 3
-; CHECK-NEXT:    [[VAL42:%.*]] = load i8, ptr [[VAL41]], align 1
-; CHECK-NEXT:    [[VAL43:%.*]] = uitofp i8 [[VAL42]] to float
-; CHECK-NEXT:    [[VAL44:%.*]] = fsub fast float [[VAL34]], [[VAL]]
-; CHECK-NEXT:    [[VAL45:%.*]] = fsub fast float [[VAL37]], [[VAL9]]
-; CHECK-NEXT:    [[VAL46:%.*]] = fsub fast float [[VAL40]], [[VAL10]]
-; CHECK-NEXT:    [[VAL47:%.*]] = fsub fast float [[VAL43]], [[VAL11]]
-; CHECK-NEXT:    [[VAL48:%.*]] = fmul fast float [[VAL44]], [[VAL26]]
-; CHECK-NEXT:    [[VAL49:%.*]] = fmul fast float [[VAL45]], [[VAL27]]
-; CHECK-NEXT:    [[VAL50:%.*]] = fadd fast float [[VAL49]], [[VAL48]]
-; CHECK-NEXT:    [[VAL51:%.*]] = fmul fast float [[VAL46]], [[VAL28]]
-; CHECK-NEXT:    [[VAL52:%.*]] = fadd fast float [[VAL50]], [[VAL51]]
-; CHECK-NEXT:    [[VAL53:%.*]] = fmul fast float [[VAL47]], [[VAL29]]
-; CHECK-NEXT:    [[VAL54:%.*]] = fadd fast float [[VAL52]], [[VAL53]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = uitofp <4 x i8> [[TMP12]] to <4 x float>
+; CHECK-NEXT:    [[TMP14:%.*]] = fsub fast <4 x float> [[TMP13]], [[TMP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul fast <4 x float> [[TMP14]], [[TMP11]]
+; CHECK-NEXT:    [[VAL54:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP15]])
 ; CHECK-NEXT:    [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]])
 ; CHECK-NEXT:    [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[VAL54]])
 ; CHECK-NEXT:    call void @ham(float [[VAL55]], float [[VAL56]])
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 912d60d..257e466 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -30,11 +30,11 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
-; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
 ; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
-; CHECK-NEXT:    [[TMP84:%.*]] = zext i8 [[TMP29]] to i32
+; CHECK-NEXT:    [[CONV_2:%.*]] = zext i8 [[TMP6]] to i32
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
 ; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP31:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
@@ -50,7 +50,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP51:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
-; CHECK-NEXT:    [[TMP83:%.*]] = zext i8 [[TMP33]] to i32
+; CHECK-NEXT:    [[CONV9_2:%.*]] = zext i8 [[TMP7]] to i32
 ; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP35:%.*]] = sub <2 x i32> [[TMP51]], [[TMP57]]
@@ -61,14 +61,14 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP61]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], splat (i32 16)
 ; CHECK-NEXT:    [[TMP42:%.*]] = add <2 x i32> [[TMP37]], [[TMP35]]
-; CHECK-NEXT:    [[TMP43:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
+; CHECK-NEXT:    [[TMP34:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
 ; CHECK-NEXT:    [[TMP44:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]]
-; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1
-; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[TMP34]], [[TMP73]]
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0
-; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1
-; CHECK-NEXT:    [[ADD55_2:%.*]] = add i32 [[TMP48]], [[TMP47]]
+; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <2 x i32> [[TMP34]], i32 0
+; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <2 x i32> [[TMP34]], i32 1
+; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[TMP45]], [[TMP43]]
+; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1
+; CHECK-NEXT:    [[ADD55_2:%.*]] = add i32 [[TMP47]], [[TMP46]]
 ; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
 ; CHECK-NEXT:    [[TMP53:%.*]] = load <2 x i8>, ptr null, align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = load i8, ptr null, align 1
@@ -79,12 +79,12 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP59:%.*]] = sub <2 x i32> [[TMP62]], [[TMP55]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2)
 ; CHECK-NEXT:    [[TMP58:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
-; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <2 x i32> [[TMP58]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP48:%.*]] = shufflevector <2 x i32> [[TMP58]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP63:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1
 ; CHECK-NEXT:    [[TMP76:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32>
-; CHECK-NEXT:    [[TMP45:%.*]] = sub <2 x i32> [[TMP60]], [[TMP76]]
-; CHECK-NEXT:    [[TMP46:%.*]] = shl <2 x i32> [[TMP45]], splat (i32 16)
-; CHECK-NEXT:    [[TMP90:%.*]] = add <2 x i32> [[TMP46]], [[TMP59]]
+; CHECK-NEXT:    [[TMP81:%.*]] = sub <2 x i32> [[TMP48]], [[TMP76]]
+; CHECK-NEXT:    [[TMP167:%.*]] = shl <2 x i32> [[TMP81]], splat (i32 16)
+; CHECK-NEXT:    [[TMP75:%.*]] = add <2 x i32> [[TMP167]], [[TMP59]]
 ; CHECK-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
 ; CHECK-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
 ; CHECK-NEXT:    [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6
@@ -93,236 +93,236 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP82:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1
 ; CHECK-NEXT:    [[TMP91:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP65:%.*]] = sub <2 x i32> [[TMP79]], [[TMP91]]
-; CHECK-NEXT:    [[TMP75:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> splat (i1 true), <2 x i8> poison)
-; CHECK-NEXT:    [[TMP98:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
-; CHECK-NEXT:    [[TMP100:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1
-; CHECK-NEXT:    [[TMP103:%.*]] = zext <2 x i8> [[TMP100]] to <2 x i32>
-; CHECK-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP98]], [[TMP103]]
-; CHECK-NEXT:    [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], splat (i32 16)
-; CHECK-NEXT:    [[TMP74:%.*]] = add <2 x i32> [[TMP70]], [[TMP65]]
-; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <2 x i32> [[TMP90]], i32 0
-; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <2 x i32> [[TMP90]], i32 1
-; CHECK-NEXT:    [[ADD48_3:%.*]] = add i32 [[TMP71]], [[TMP78]]
-; CHECK-NEXT:    [[SUB51_3:%.*]] = sub i32 [[TMP78]], [[TMP71]]
-; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <2 x i32> [[TMP74]], i32 0
-; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i32> [[TMP74]], i32 1
-; CHECK-NEXT:    [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]]
-; CHECK-NEXT:    [[TMP107:%.*]] = sub i32 [[TMP80]], [[TMP81]]
-; CHECK-NEXT:    [[ADD48_4:%.*]] = add i32 [[ADD55_3]], [[ADD48_3]]
-; CHECK-NEXT:    [[TMP113:%.*]] = shufflevector <2 x i32> [[TMP43]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP122:%.*]] = insertelement <2 x i32> [[TMP113]], i32 [[ADD48_3]], i32 0
-; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <2 x i32> [[TMP43]], i32 [[ADD55_3]], i32 0
-; CHECK-NEXT:    [[TMP123:%.*]] = sub <2 x i32> [[TMP122]], [[TMP72]]
-; CHECK-NEXT:    [[ADD55_4:%.*]] = add i32 [[TMP107]], [[SUB51_3]]
-; CHECK-NEXT:    [[TMP126:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP129:%.*]] = insertelement <2 x i32> [[TMP126]], i32 [[SUB51_3]], i32 0
-; CHECK-NEXT:    [[TMP130:%.*]] = insertelement <2 x i32> [[TMP44]], i32 [[TMP107]], i32 0
-; CHECK-NEXT:    [[TMP143:%.*]] = sub <2 x i32> [[TMP129]], [[TMP130]]
-; CHECK-NEXT:    [[ADD94:%.*]] = add i32 [[ADD48_4]], [[ADD48_2]]
-; CHECK-NEXT:    [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_4]]
-; CHECK-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[TMP77]], 15
-; CHECK-NEXT:    [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
-; CHECK-NEXT:    [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
-; CHECK-NEXT:    [[SHR_I_1:%.*]] = lshr i32 [[TMP34]], 15
+; CHECK-NEXT:    [[TMP170:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> splat (i1 true), <2 x i8> poison)
+; CHECK-NEXT:    [[TMP171:%.*]] = zext <2 x i8> [[TMP170]] to <2 x i32>
+; CHECK-NEXT:    [[TMP172:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1
+; CHECK-NEXT:    [[TMP173:%.*]] = zext <2 x i8> [[TMP172]] to <2 x i32>
+; CHECK-NEXT:    [[TMP66:%.*]] = sub <2 x i32> [[TMP171]], [[TMP173]]
+; CHECK-NEXT:    [[TMP67:%.*]] = shl <2 x i32> [[TMP66]], splat (i32 16)
+; CHECK-NEXT:    [[TMP69:%.*]] = add <2 x i32> [[TMP67]], [[TMP65]]
+; CHECK-NEXT:    [[TMP176:%.*]] = extractelement <2 x i32> [[TMP75]], i32 0
+; CHECK-NEXT:    [[TMP197:%.*]] = extractelement <2 x i32> [[TMP75]], i32 1
+; CHECK-NEXT:    [[SUB59:%.*]] = add i32 [[TMP197]], [[TMP176]]
+; CHECK-NEXT:    [[SUB45_3:%.*]] = sub i32 [[TMP176]], [[TMP197]]
+; CHECK-NEXT:    [[ADD112_2:%.*]] = extractelement <2 x i32> [[TMP69]], i32 0
+; CHECK-NEXT:    [[XOR_I63_2:%.*]] = extractelement <2 x i32> [[TMP69]], i32 1
+; CHECK-NEXT:    [[SUB59_1:%.*]] = add i32 [[XOR_I63_2]], [[ADD112_2]]
+; CHECK-NEXT:    [[SUB47_3:%.*]] = sub i32 [[ADD112_2]], [[XOR_I63_2]]
+; CHECK-NEXT:    [[ADD94:%.*]] = add i32 [[SUB59_1]], [[SUB59]]
+; CHECK-NEXT:    [[TMP70:%.*]] = shufflevector <2 x i32> [[TMP34]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP71:%.*]] = insertelement <2 x i32> [[TMP70]], i32 [[SUB59]], i32 0
+; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <2 x i32> [[TMP34]], i32 [[SUB59_1]], i32 0
+; CHECK-NEXT:    [[TMP222:%.*]] = sub <2 x i32> [[TMP71]], [[TMP72]]
+; CHECK-NEXT:    [[ADD55_3:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]]
+; CHECK-NEXT:    [[TMP74:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[SUB45_3]], i32 0
+; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <2 x i32> [[TMP44]], i32 [[SUB47_3]], i32 0
+; CHECK-NEXT:    [[TMP85:%.*]] = sub <2 x i32> [[TMP78]], [[TMP80]]
+; CHECK-NEXT:    [[ADD95:%.*]] = add i32 [[ADD94]], [[ADD48_2]]
+; CHECK-NEXT:    [[SUB86_3:%.*]] = sub i32 [[ADD48_2]], [[ADD94]]
+; CHECK-NEXT:    [[SHR_I:%.*]] = lshr i32 [[TMP77]], 15
+; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
+; CHECK-NEXT:    [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535
+; CHECK-NEXT:    [[SHR_I49:%.*]] = lshr i32 [[TMP45]], 15
+; CHECK-NEXT:    [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537
+; CHECK-NEXT:    [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535
+; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]]
+; CHECK-NEXT:    [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]]
+; CHECK-NEXT:    [[SHR_I_1:%.*]] = lshr i32 [[CONV9_2]], 15
 ; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
 ; CHECK-NEXT:    [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535
-; CHECK-NEXT:    [[ADD94_5:%.*]] = add i32 [[ADD55_4]], [[ADD55_2]]
-; CHECK-NEXT:    [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_4]]
-; CHECK-NEXT:    [[SHR_I_2:%.*]] = lshr i32 [[TMP83]], 15
-; CHECK-NEXT:    [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537
-; CHECK-NEXT:    [[MUL_I_2:%.*]] = mul i32 [[AND_I_2]], 65535
-; CHECK-NEXT:    [[SHR_I49_1:%.*]] = lshr i32 [[TMP84]], 15
+; CHECK-NEXT:    [[SHR_I49_1:%.*]] = lshr i32 [[CONV_2]], 15
 ; CHECK-NEXT:    [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
-; CHECK-NEXT:    [[ADD94_2:%.*]] = mul i32 [[AND_I50_1]], 65535
-; CHECK-NEXT:    [[TMP144:%.*]] = extractelement <2 x i32> [[TMP123]], i32 0
-; CHECK-NEXT:    [[TMP145:%.*]] = extractelement <2 x i32> [[TMP123]], i32 1
-; CHECK-NEXT:    [[ADD94_4:%.*]] = add i32 [[TMP144]], [[TMP145]]
-; CHECK-NEXT:    [[TMP169:%.*]] = sub i32 [[TMP145]], [[TMP144]]
+; CHECK-NEXT:    [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535
+; CHECK-NEXT:    [[TMP86:%.*]] = extractelement <2 x i32> [[TMP222]], i32 0
+; CHECK-NEXT:    [[TMP87:%.*]] = extractelement <2 x i32> [[TMP222]], i32 1
+; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP86]], [[TMP87]]
+; CHECK-NEXT:    [[ADD112_1:%.*]] = sub i32 [[TMP87]], [[TMP86]]
 ; CHECK-NEXT:    [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15
 ; CHECK-NEXT:    [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
 ; CHECK-NEXT:    [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0
-; CHECK-NEXT:    [[TMP147:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1
-; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP146]], [[TMP147]]
-; CHECK-NEXT:    [[SUB102_3:%.*]] = sub i32 [[TMP147]], [[TMP146]]
-; CHECK-NEXT:    [[SHR_I49_4:%.*]] = lshr i32 [[CONV1]], 15
-; CHECK-NEXT:    [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
-; CHECK-NEXT:    [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
-; CHECK-NEXT:    [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
-; CHECK-NEXT:    [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
-; CHECK-NEXT:    [[TMP148:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
-; CHECK-NEXT:    [[TMP67:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP85:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
-; CHECK-NEXT:    [[TMP149:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP106:%.*]] = shufflevector <4 x i8> [[TMP149]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP108:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32>
-; CHECK-NEXT:    [[TMP150:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
-; CHECK-NEXT:    [[TMP109:%.*]] = shufflevector <4 x i8> [[TMP150]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP89:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32>
-; CHECK-NEXT:    [[TMP87:%.*]] = sub <2 x i32> [[TMP108]], [[TMP89]]
-; CHECK-NEXT:    [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], splat (i32 16)
-; CHECK-NEXT:    [[TMP112:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP120:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32>
-; CHECK-NEXT:    [[TMP94:%.*]] = shufflevector <4 x i8> [[TMP149]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP128:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32>
-; CHECK-NEXT:    [[TMP131:%.*]] = shufflevector <4 x i8> [[TMP150]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP132:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
-; CHECK-NEXT:    [[TMP95:%.*]] = sub <2 x i32> [[TMP128]], [[TMP132]]
-; CHECK-NEXT:    [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], splat (i32 16)
-; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1
-; CHECK-NEXT:    [[TMP117:%.*]] = sub <2 x i32> [[TMP97]], [[TMP120]]
-; CHECK-NEXT:    [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP117]]
-; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
-; CHECK-NEXT:    [[TMP119:%.*]] = sub <2 x i32> [[TMP86]], [[TMP85]]
-; CHECK-NEXT:    [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP119]]
-; CHECK-NEXT:    [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP92]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP101:%.*]] = add <2 x i32> [[TMP105]], [[TMP92]]
-; CHECK-NEXT:    [[TMP151:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]]
-; CHECK-NEXT:    [[TMP111:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0
-; CHECK-NEXT:    [[TMP99:%.*]] = extractelement <2 x i32> [[TMP101]], i32 1
-; CHECK-NEXT:    [[ADD55:%.*]] = add i32 [[TMP99]], [[TMP111]]
-; CHECK-NEXT:    [[SUB51:%.*]] = sub i32 [[TMP111]], [[TMP99]]
-; CHECK-NEXT:    [[TMP153:%.*]] = extractelement <2 x i32> [[TMP151]], i32 0
-; CHECK-NEXT:    [[TMP157:%.*]] = extractelement <2 x i32> [[TMP151]], i32 1
-; CHECK-NEXT:    [[ADD78_1:%.*]] = add i32 [[TMP157]], [[TMP153]]
-; CHECK-NEXT:    [[SUB59:%.*]] = sub i32 [[TMP153]], [[TMP157]]
-; CHECK-NEXT:    [[SHR_I59_1:%.*]] = lshr i32 [[TMP99]], 15
+; CHECK-NEXT:    [[TMP88:%.*]] = extractelement <2 x i32> [[TMP85]], i32 0
+; CHECK-NEXT:    [[TMP89:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1
+; CHECK-NEXT:    [[ADD94_4:%.*]] = add i32 [[TMP88]], [[TMP89]]
+; CHECK-NEXT:    [[SUB102_3:%.*]] = sub i32 [[TMP89]], [[TMP88]]
+; CHECK-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[CONV1]], 15
+; CHECK-NEXT:    [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
+; CHECK-NEXT:    [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
+; CHECK-NEXT:    [[TMP90:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
+; CHECK-NEXT:    [[TMP102:%.*]] = zext <2 x i8> [[TMP90]] to <2 x i32>
+; CHECK-NEXT:    [[TMP92:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
+; CHECK-NEXT:    [[TMP93:%.*]] = shufflevector <4 x i8> [[TMP92]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP94:%.*]] = zext <2 x i8> [[TMP93]] to <2 x i32>
+; CHECK-NEXT:    [[TMP95:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP96:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32>
+; CHECK-NEXT:    [[TMP98:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
+; CHECK-NEXT:    [[TMP99:%.*]] = shufflevector <4 x i8> [[TMP98]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP100:%.*]] = zext <2 x i8> [[TMP99]] to <2 x i32>
+; CHECK-NEXT:    [[TMP101:%.*]] = sub <2 x i32> [[TMP97]], [[TMP100]]
+; CHECK-NEXT:    [[TMP224:%.*]] = shl <2 x i32> [[TMP101]], splat (i32 16)
+; CHECK-NEXT:    [[TMP103:%.*]] = shufflevector <4 x i8> [[TMP92]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP104:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32>
+; CHECK-NEXT:    [[TMP105:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP106:%.*]] = zext <2 x i8> [[TMP105]] to <2 x i32>
+; CHECK-NEXT:    [[TMP107:%.*]] = shufflevector <4 x i8> [[TMP98]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP108:%.*]] = zext <2 x i8> [[TMP107]] to <2 x i32>
+; CHECK-NEXT:    [[TMP109:%.*]] = sub <2 x i32> [[TMP106]], [[TMP108]]
+; CHECK-NEXT:    [[TMP110:%.*]] = shl <2 x i32> [[TMP109]], splat (i32 16)
+; CHECK-NEXT:    [[TMP111:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1
+; CHECK-NEXT:    [[TMP112:%.*]] = sub <2 x i32> [[TMP111]], [[TMP104]]
+; CHECK-NEXT:    [[TMP113:%.*]] = add <2 x i32> [[TMP110]], [[TMP112]]
+; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
+; CHECK-NEXT:    [[TMP115:%.*]] = sub <2 x i32> [[TMP114]], [[TMP94]]
+; CHECK-NEXT:    [[TMP116:%.*]] = add <2 x i32> [[TMP224]], [[TMP115]]
+; CHECK-NEXT:    [[TMP117:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> [[TMP116]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP126:%.*]] = add <2 x i32> [[TMP113]], [[TMP116]]
+; CHECK-NEXT:    [[TMP119:%.*]] = sub <2 x i32> [[TMP116]], [[TMP113]]
+; CHECK-NEXT:    [[TMP120:%.*]] = extractelement <2 x i32> [[TMP126]], i32 0
+; CHECK-NEXT:    [[TMP127:%.*]] = extractelement <2 x i32> [[TMP126]], i32 1
+; CHECK-NEXT:    [[ADD48:%.*]] = add i32 [[TMP127]], [[TMP120]]
+; CHECK-NEXT:    [[TMP166:%.*]] = sub i32 [[TMP120]], [[TMP127]]
+; CHECK-NEXT:    [[TMP128:%.*]] = extractelement <2 x i32> [[TMP119]], i32 0
+; CHECK-NEXT:    [[TMP129:%.*]] = extractelement <2 x i32> [[TMP119]], i32 1
+; CHECK-NEXT:    [[ADD55:%.*]] = add i32 [[TMP129]], [[TMP128]]
+; CHECK-NEXT:    [[SUB60:%.*]] = sub i32 [[TMP128]], [[TMP129]]
+; CHECK-NEXT:    [[SHR_I59:%.*]] = lshr i32 [[TMP127]], 15
+; CHECK-NEXT:    [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
+; CHECK-NEXT:    [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
+; CHECK-NEXT:    [[SHR_I59_1:%.*]] = lshr i32 [[TMP129]], 15
 ; CHECK-NEXT:    [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537
 ; CHECK-NEXT:    [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
-; CHECK-NEXT:    [[SHR_I59_4:%.*]] = lshr i32 [[TMP157]], 15
-; CHECK-NEXT:    [[AND_I60_4:%.*]] = and i32 [[SHR_I59_4]], 65537
-; CHECK-NEXT:    [[MUL_I61_4:%.*]] = mul i32 [[AND_I60_4]], 65535
-; CHECK-NEXT:    [[TMP104:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
-; CHECK-NEXT:    [[TMP110:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32>
-; CHECK-NEXT:    [[TMP158:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1
-; CHECK-NEXT:    [[TMP114:%.*]] = shufflevector <4 x i8> [[TMP158]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP133:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
-; CHECK-NEXT:    [[TMP121:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT:    [[TMP116:%.*]] = shufflevector <4 x i8> [[TMP121]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP115:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
-; CHECK-NEXT:    [[TMP159:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT:    [[TMP118:%.*]] = shufflevector <4 x i8> [[TMP159]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP134:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
-; CHECK-NEXT:    [[TMP124:%.*]] = sub <2 x i32> [[TMP115]], [[TMP134]]
-; CHECK-NEXT:    [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], splat (i32 16)
-; CHECK-NEXT:    [[TMP127:%.*]] = shufflevector <4 x i8> [[TMP158]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP191:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
-; CHECK-NEXT:    [[TMP160:%.*]] = shufflevector <4 x i8> [[TMP121]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP161:%.*]] = zext <2 x i8> [[TMP160]] to <2 x i32>
-; CHECK-NEXT:    [[TMP171:%.*]] = shufflevector <4 x i8> [[TMP159]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP172:%.*]] = zext <2 x i8> [[TMP171]] to <2 x i32>
-; CHECK-NEXT:    [[TMP135:%.*]] = sub <2 x i32> [[TMP161]], [[TMP172]]
-; CHECK-NEXT:    [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], splat (i32 16)
-; CHECK-NEXT:    [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1
-; CHECK-NEXT:    [[TMP173:%.*]] = sub <2 x i32> [[TMP137]], [[TMP191]]
-; CHECK-NEXT:    [[TMP174:%.*]] = add <2 x i32> [[TMP136]], [[TMP173]]
-; CHECK-NEXT:    [[TMP140:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT:    [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP133]]
-; CHECK-NEXT:    [[TMP192:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]]
-; CHECK-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP174]], [[TMP192]]
-; CHECK-NEXT:    [[TMP155:%.*]] = sub <2 x i32> [[TMP192]], [[TMP174]]
-; CHECK-NEXT:    [[TMP139:%.*]] = extractelement <2 x i32> [[TMP156]], i32 0
-; CHECK-NEXT:    [[TMP142:%.*]] = extractelement <2 x i32> [[TMP156]], i32 1
-; CHECK-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP142]], [[TMP139]]
-; CHECK-NEXT:    [[SUB45_1:%.*]] = sub i32 [[TMP139]], [[TMP142]]
-; CHECK-NEXT:    [[TMP138:%.*]] = extractelement <2 x i32> [[TMP155]], i32 0
-; CHECK-NEXT:    [[SUB47_1:%.*]] = extractelement <2 x i32> [[TMP155]], i32 1
-; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[SUB47_1]], [[TMP138]]
-; CHECK-NEXT:    [[SUB59_1:%.*]] = sub i32 [[TMP138]], [[SUB47_1]]
-; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP142]], 15
+; CHECK-NEXT:    [[TMP130:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
+; CHECK-NEXT:    [[TMP131:%.*]] = zext <2 x i8> [[TMP130]] to <2 x i32>
+; CHECK-NEXT:    [[TMP132:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1
+; CHECK-NEXT:    [[TMP133:%.*]] = shufflevector <4 x i8> [[TMP132]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP134:%.*]] = zext <2 x i8> [[TMP133]] to <2 x i32>
+; CHECK-NEXT:    [[TMP135:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; CHECK-NEXT:    [[TMP136:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP137:%.*]] = zext <2 x i8> [[TMP136]] to <2 x i32>
+; CHECK-NEXT:    [[TMP138:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
+; CHECK-NEXT:    [[TMP139:%.*]] = shufflevector <4 x i8> [[TMP138]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP140:%.*]] = zext <2 x i8> [[TMP139]] to <2 x i32>
+; CHECK-NEXT:    [[TMP141:%.*]] = sub <2 x i32> [[TMP137]], [[TMP140]]
+; CHECK-NEXT:    [[TMP142:%.*]] = shl <2 x i32> [[TMP141]], splat (i32 16)
+; CHECK-NEXT:    [[TMP143:%.*]] = shufflevector <4 x i8> [[TMP132]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP144:%.*]] = zext <2 x i8> [[TMP143]] to <2 x i32>
+; CHECK-NEXT:    [[TMP145:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP146:%.*]] = zext <2 x i8> [[TMP145]] to <2 x i32>
+; CHECK-NEXT:    [[TMP147:%.*]] = shufflevector <4 x i8> [[TMP138]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP148:%.*]] = zext <2 x i8> [[TMP147]] to <2 x i32>
+; CHECK-NEXT:    [[TMP149:%.*]] = sub <2 x i32> [[TMP146]], [[TMP148]]
+; CHECK-NEXT:    [[TMP150:%.*]] = shl <2 x i32> [[TMP149]], splat (i32 16)
+; CHECK-NEXT:    [[TMP151:%.*]] = insertelement <2 x i32> [[TMP131]], i32 [[CONV33_1]], i32 1
+; CHECK-NEXT:    [[TMP225:%.*]] = sub <2 x i32> [[TMP151]], [[TMP144]]
+; CHECK-NEXT:    [[TMP153:%.*]] = add <2 x i32> [[TMP150]], [[TMP225]]
+; CHECK-NEXT:    [[TMP154:%.*]] = insertelement <2 x i32> [[TMP131]], i32 [[CONV_1]], i32 0
+; CHECK-NEXT:    [[TMP155:%.*]] = sub <2 x i32> [[TMP154]], [[TMP134]]
+; CHECK-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP142]], [[TMP155]]
+; CHECK-NEXT:    [[TMP157:%.*]] = add <2 x i32> [[TMP153]], [[TMP156]]
+; CHECK-NEXT:    [[TMP158:%.*]] = sub <2 x i32> [[TMP156]], [[TMP153]]
+; CHECK-NEXT:    [[TMP159:%.*]] = extractelement <2 x i32> [[TMP157]], i32 0
+; CHECK-NEXT:    [[TMP160:%.*]] = extractelement <2 x i32> [[TMP157]], i32 1
+; CHECK-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP160]], [[TMP159]]
+; CHECK-NEXT:    [[SUB51_1:%.*]] = sub i32 [[TMP159]], [[TMP160]]
+; CHECK-NEXT:    [[TMP161:%.*]] = extractelement <2 x i32> [[TMP158]], i32 0
+; CHECK-NEXT:    [[TMP162:%.*]] = extractelement <2 x i32> [[TMP158]], i32 1
+; CHECK-NEXT:    [[ADD55_1:%.*]] = add i32 [[TMP162]], [[TMP161]]
+; CHECK-NEXT:    [[SUB59_2:%.*]] = sub i32 [[TMP161]], [[TMP162]]
+; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP160]], 15
 ; CHECK-NEXT:    [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
 ; CHECK-NEXT:    [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
-; CHECK-NEXT:    [[SHR_I54_1:%.*]] = lshr i32 [[SUB47_1]], 15
+; CHECK-NEXT:    [[SHR_I54_1:%.*]] = lshr i32 [[TMP162]], 15
 ; CHECK-NEXT:    [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537
 ; CHECK-NEXT:    [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535
-; CHECK-NEXT:    [[TMP154:%.*]] = lshr <2 x i32> [[TMP110]], splat (i32 15)
-; CHECK-NEXT:    [[TMP184:%.*]] = and <2 x i32> [[TMP154]], splat (i32 65537)
-; CHECK-NEXT:    [[TMP195:%.*]] = mul <2 x i32> [[TMP184]], splat (i32 65535)
-; CHECK-NEXT:    [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD55]]
-; CHECK-NEXT:    [[SUB86:%.*]] = sub i32 [[ADD55]], [[ADD48_1]]
-; CHECK-NEXT:    [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]]
-; CHECK-NEXT:    [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]]
-; CHECK-NEXT:    [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]]
-; CHECK-NEXT:    [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB102]]
-; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I51_3]], [[ADD103]]
+; CHECK-NEXT:    [[TMP163:%.*]] = lshr <2 x i32> [[TMP131]], splat (i32 15)
+; CHECK-NEXT:    [[TMP164:%.*]] = and <2 x i32> [[TMP163]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP165:%.*]] = mul <2 x i32> [[TMP164]], splat (i32 65535)
+; CHECK-NEXT:    [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]]
+; CHECK-NEXT:    [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]]
+; CHECK-NEXT:    [[ADD103:%.*]] = add i32 [[ADD95]], [[ADD78]]
+; CHECK-NEXT:    [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD95]]
+; CHECK-NEXT:    [[ADD105:%.*]] = add i32 [[SUB86_3]], [[SUB86]]
+; CHECK-NEXT:    [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB86_3]]
+; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
 ; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP77]]
-; CHECK-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I_1]], [[ADD105]]
-; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP34]]
+; CHECK-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]]
+; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP45]]
 ; CHECK-NEXT:    [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
-; CHECK-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP142]]
-; CHECK-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I61_1]], [[SUB106]]
-; CHECK-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP99]]
+; CHECK-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP160]]
+; CHECK-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
+; CHECK-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP127]]
 ; CHECK-NEXT:    [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
 ; CHECK-NEXT:    [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
-; CHECK-NEXT:    [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
+; CHECK-NEXT:    [[ADD105_3:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
+; CHECK-NEXT:    [[ADD78_1:%.*]] = add i32 [[ADD55_1]], [[ADD55]]
+; CHECK-NEXT:    [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD55_1]]
 ; CHECK-NEXT:    [[ADD103_1:%.*]] = add i32 [[ADD94_1]], [[ADD78_1]]
 ; CHECK-NEXT:    [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]]
-; CHECK-NEXT:    [[ADD103_2:%.*]] = add i32 [[ADD94_5]], [[ADD103_1]]
-; CHECK-NEXT:    [[SUB104_2:%.*]] = sub i32 [[ADD103_1]], [[ADD94_5]]
-; CHECK-NEXT:    [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB104_1]]
-; CHECK-NEXT:    [[SUB106_1:%.*]] = sub i32 [[SUB104_1]], [[SUB102_1]]
-; CHECK-NEXT:    [[ADD_I_1:%.*]] = add i32 [[MUL_I_2]], [[ADD103_2]]
-; CHECK-NEXT:    [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[TMP83]]
-; CHECK-NEXT:    [[ADD_I52_1:%.*]] = add i32 [[ADD94_2]], [[ADD105_1]]
-; CHECK-NEXT:    [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP84]]
-; CHECK-NEXT:    [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_2]]
-; CHECK-NEXT:    [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[SUB47_1]]
-; CHECK-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_4]], [[SUB106_1]]
-; CHECK-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP157]]
-; CHECK-NEXT:    [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]]
+; CHECK-NEXT:    [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]]
+; CHECK-NEXT:    [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]]
+; CHECK-NEXT:    [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]]
+; CHECK-NEXT:    [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[CONV9_2]]
+; CHECK-NEXT:    [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]]
+; CHECK-NEXT:    [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[CONV_2]]
+; CHECK-NEXT:    [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]]
+; CHECK-NEXT:    [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[TMP162]]
+; CHECK-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
+; CHECK-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP129]]
+; CHECK-NEXT:    [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD105_3]]
 ; CHECK-NEXT:    [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]]
-; CHECK-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
-; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
-; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[SUB45_1]], [[SUB51]]
-; CHECK-NEXT:    [[TMP170:%.*]] = sub i32 [[SUB51]], [[SUB45_1]]
-; CHECK-NEXT:    [[TMP162:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
-; CHECK-NEXT:    [[TMP163:%.*]] = shufflevector <2 x i32> [[TMP162]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP164:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0
-; CHECK-NEXT:    [[TMP165:%.*]] = shufflevector <2 x i32> [[TMP164]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP166:%.*]] = add <2 x i32> [[TMP163]], [[TMP165]]
-; CHECK-NEXT:    [[TMP167:%.*]] = sub <2 x i32> [[TMP163]], [[TMP165]]
-; CHECK-NEXT:    [[TMP168:%.*]] = shufflevector <2 x i32> [[TMP166]], <2 x i32> [[TMP167]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[ADD105_2:%.*]] = add i32 [[TMP169]], [[TMP170]]
-; CHECK-NEXT:    [[SUB106_2:%.*]] = sub i32 [[TMP170]], [[TMP169]]
-; CHECK-NEXT:    [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]]
-; CHECK-NEXT:    [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]]
-; CHECK-NEXT:    [[TMP197:%.*]] = add <2 x i32> [[TMP195]], [[TMP168]]
-; CHECK-NEXT:    [[TMP152:%.*]] = xor <2 x i32> [[TMP197]], [[TMP110]]
-; CHECK-NEXT:    [[SHR_I59_2:%.*]] = lshr i32 [[TMP111]], 15
-; CHECK-NEXT:    [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
-; CHECK-NEXT:    [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535
-; CHECK-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
-; CHECK-NEXT:    [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP111]]
-; CHECK-NEXT:    [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
-; CHECK-NEXT:    [[TMP175:%.*]] = extractelement <2 x i32> [[TMP152]], i32 0
-; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP175]]
-; CHECK-NEXT:    [[TMP176:%.*]] = extractelement <2 x i32> [[TMP152]], i32 1
-; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP176]]
-; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT:    [[ADD78_3:%.*]] = add i32 [[SUB59_1]], [[SUB59]]
-; CHECK-NEXT:    [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB59_1]]
+; CHECK-NEXT:    [[ADD112_5:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
+; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_5]], [[XOR_I63_1]]
+; CHECK-NEXT:    [[ADD78_3:%.*]] = add i32 [[SUB51_1]], [[TMP166]]
+; CHECK-NEXT:    [[TMP204:%.*]] = sub i32 [[TMP166]], [[SUB51_1]]
 ; CHECK-NEXT:    [[TMP177:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
 ; CHECK-NEXT:    [[TMP178:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP179:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
 ; CHECK-NEXT:    [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP179]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP181:%.*]] = add <2 x i32> [[TMP178]], [[TMP180]]
-; CHECK-NEXT:    [[TMP182:%.*]] = sub <2 x i32> [[TMP178]], [[TMP180]]
-; CHECK-NEXT:    [[TMP183:%.*]] = shufflevector <2 x i32> [[TMP181]], <2 x i32> [[TMP182]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[ADD105_3:%.*]] = add i32 [[SUB102_3]], [[SUB86_3]]
-; CHECK-NEXT:    [[SUB106_3:%.*]] = sub i32 [[SUB86_3]], [[SUB102_3]]
-; CHECK-NEXT:    [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_4]], [[ADD105_3]]
-; CHECK-NEXT:    [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV1]]
+; CHECK-NEXT:    [[TMP199:%.*]] = add <2 x i32> [[TMP178]], [[TMP180]]
+; CHECK-NEXT:    [[TMP200:%.*]] = sub <2 x i32> [[TMP178]], [[TMP180]]
+; CHECK-NEXT:    [[TMP201:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> [[TMP200]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP204]]
+; CHECK-NEXT:    [[SUB106_2:%.*]] = sub i32 [[TMP204]], [[ADD112_1]]
+; CHECK-NEXT:    [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD113_1]]
+; CHECK-NEXT:    [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]]
+; CHECK-NEXT:    [[TMP208:%.*]] = add <2 x i32> [[TMP165]], [[TMP201]]
+; CHECK-NEXT:    [[TMP209:%.*]] = xor <2 x i32> [[TMP208]], [[TMP131]]
+; CHECK-NEXT:    [[SHR_I59_2:%.*]] = lshr i32 [[TMP120]], 15
+; CHECK-NEXT:    [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
+; CHECK-NEXT:    [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535
+; CHECK-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
+; CHECK-NEXT:    [[XOR_I63_4:%.*]] = xor i32 [[ADD_I62_2]], [[TMP120]]
+; CHECK-NEXT:    [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_2]]
+; CHECK-NEXT:    [[TMP211:%.*]] = extractelement <2 x i32> [[TMP209]], i32 0
+; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP211]]
+; CHECK-NEXT:    [[TMP212:%.*]] = extractelement <2 x i32> [[TMP209]], i32 1
+; CHECK-NEXT:    [[ADD112_4:%.*]] = add i32 [[ADD110_2]], [[TMP212]]
+; CHECK-NEXT:    [[ADD113_4:%.*]] = add i32 [[ADD112_4]], [[XOR_I63_4]]
+; CHECK-NEXT:    [[ADD78_4:%.*]] = add i32 [[SUB59_2]], [[SUB60]]
+; CHECK-NEXT:    [[SUB86_4:%.*]] = sub i32 [[SUB60]], [[SUB59_2]]
+; CHECK-NEXT:    [[TMP213:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_4]], i32 0
+; CHECK-NEXT:    [[TMP214:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP215:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0
+; CHECK-NEXT:    [[TMP216:%.*]] = shufflevector <2 x i32> [[TMP215]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP217:%.*]] = add <2 x i32> [[TMP214]], [[TMP216]]
+; CHECK-NEXT:    [[TMP218:%.*]] = sub <2 x i32> [[TMP214]], [[TMP216]]
+; CHECK-NEXT:    [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP217]], <2 x i32> [[TMP218]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[ADD105_4:%.*]] = add i32 [[SUB102_3]], [[SUB86_4]]
+; CHECK-NEXT:    [[SUB106_3:%.*]] = sub i32 [[SUB86_4]], [[SUB102_3]]
+; CHECK-NEXT:    [[ADD_I52_4:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_4]]
+; CHECK-NEXT:    [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_4]], [[CONV1]]
 ; CHECK-NEXT:    [[TMP185:%.*]] = lshr <2 x i32> [[TMP102]], splat (i32 15)
 ; CHECK-NEXT:    [[TMP193:%.*]] = and <2 x i32> [[TMP185]], splat (i32 65537)
 ; CHECK-NEXT:    [[TMP186:%.*]] = mul <2 x i32> [[TMP193]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP187:%.*]] = add <2 x i32> [[TMP186]], [[TMP183]]
+; CHECK-NEXT:    [[TMP187:%.*]] = add <2 x i32> [[TMP186]], [[TMP219]]
 ; CHECK-NEXT:    [[TMP188:%.*]] = xor <2 x i32> [[TMP187]], [[TMP102]]
 ; CHECK-NEXT:    [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15
 ; CHECK-NEXT:    [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537
 ; CHECK-NEXT:    [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535
 ; CHECK-NEXT:    [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
 ; CHECK-NEXT:    [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
-; CHECK-NEXT:    [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
+; CHECK-NEXT:    [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_4]]
 ; CHECK-NEXT:    [[TMP189:%.*]] = extractelement <2 x i32> [[TMP188]], i32 0
 ; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP189]]
 ; CHECK-NEXT:    [[TMP190:%.*]] = extractelement <2 x i32> [[TMP188]], i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/long-gep-chains.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/long-gep-chains.ll
new file mode 100644
index 0000000..cf1ed54
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/long-gep-chains.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=slp-vectorizer -mtriple=riscv64-unknown-linux -mattr=+v < %s |  FileCheck %s
+
+define i64 @test(ptr %arg, i32 %arg1, i64 %i) {
+; CHECK-LABEL: define i64 @test(
+; CHECK-SAME: ptr [[ARG:%.*]], i32 [[ARG1:%.*]], i64 [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[I2:%.*]] = getelementptr i8, ptr [[ARG]], i64 [[I]]
+; CHECK-NEXT:    [[I3:%.*]] = getelementptr i8, ptr [[I2]], i64 [[I]]
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr i8, ptr [[I3]], i64 [[I]]
+; CHECK-NEXT:    [[I5:%.*]] = getelementptr i8, ptr [[I4]], i64 [[I]]
+; CHECK-NEXT:    [[I6:%.*]] = getelementptr i8, ptr [[I5]], i64 [[I]]
+; CHECK-NEXT:    [[I7:%.*]] = getelementptr i8, ptr [[I6]], i64 [[I]]
+; CHECK-NEXT:    [[I8:%.*]] = getelementptr i8, ptr [[I7]], i64 [[I]]
+; CHECK-NEXT:    [[I9:%.*]] = getelementptr i8, ptr [[I8]], i64 [[I]]
+; CHECK-NEXT:    [[I10:%.*]] = getelementptr i8, ptr [[I9]], i64 [[I]]
+; CHECK-NEXT:    [[I11:%.*]] = getelementptr i8, ptr [[I10]], i64 [[I]]
+; CHECK-NEXT:    [[I12:%.*]] = getelementptr i8, ptr [[I11]], i64 [[I]]
+; CHECK-NEXT:    [[I13:%.*]] = getelementptr i8, ptr [[I12]], i64 [[I]]
+; CHECK-NEXT:    [[I14:%.*]] = getelementptr i8, ptr [[I13]], i64 [[I]]
+; CHECK-NEXT:    [[I140:%.*]] = load i8, ptr [[I14]], align 1
+; CHECK-NEXT:    [[I1412:%.*]] = zext i8 [[I140]] to i32
+; CHECK-NEXT:    [[I142:%.*]] = mul i32 [[ARG1]], [[I1412]]
+; CHECK-NEXT:    [[I143:%.*]] = getelementptr i8, ptr [[I13]], i64 15
+; CHECK-NEXT:    [[I144:%.*]] = load i8, ptr [[I143]], align 1
+; CHECK-NEXT:    [[I1453:%.*]] = zext i8 [[I144]] to i32
+; CHECK-NEXT:    [[I146:%.*]] = mul i32 [[ARG1]], [[I1453]]
+; CHECK-NEXT:    [[I147:%.*]] = getelementptr i8, ptr [[I13]], i64 14
+; CHECK-NEXT:    [[I148:%.*]] = load i8, ptr [[I147]], align 1
+; CHECK-NEXT:    [[I1494:%.*]] = zext i8 [[I148]] to i32
+; CHECK-NEXT:    [[I150:%.*]] = mul i32 [[ARG1]], [[I1494]]
+; CHECK-NEXT:    [[I151:%.*]] = getelementptr i8, ptr [[I13]], i64 13
+; CHECK-NEXT:    [[I152:%.*]] = load i8, ptr [[I151]], align 1
+; CHECK-NEXT:    [[I1535:%.*]] = zext i8 [[I152]] to i32
+; CHECK-NEXT:    [[I154:%.*]] = mul i32 [[ARG1]], [[I1535]]
+; CHECK-NEXT:    [[I1311:%.*]] = or i32 [[I142]], [[I146]]
+; CHECK-NEXT:    [[I1312:%.*]] = or i32 [[I1311]], [[I150]]
+; CHECK-NEXT:    [[I1313:%.*]] = or i32 [[I1312]], [[I154]]
+; CHECK-NEXT:    [[I1536:%.*]] = zext i32 [[I1313]] to i64
+; CHECK-NEXT:    ret i64 [[I1536]]
+;
+bb:
+  %i2 = getelementptr i8, ptr %arg, i64 %i
+  %i3 = getelementptr i8, ptr %i2, i64 %i
+  %i4 = getelementptr i8, ptr %i3, i64 %i
+  %i5 = getelementptr i8, ptr %i4, i64 %i
+  %i6 = getelementptr i8, ptr %i5, i64 %i
+  %i7 = getelementptr i8, ptr %i6, i64 %i
+  %i8 = getelementptr i8, ptr %i7, i64 %i
+  %i9 = getelementptr i8, ptr %i8, i64 %i
+  %i10 = getelementptr i8, ptr %i9, i64 %i
+  %i11 = getelementptr i8, ptr %i10, i64 %i
+  %i12 = getelementptr i8, ptr %i11, i64 %i
+  %i13 = getelementptr i8, ptr %i12, i64 %i
+  %i14 = getelementptr i8, ptr %i13, i64 %i
+  %i140 = load i8, ptr %i14, align 1
+  %i1412 = zext i8 %i140 to i32
+  %i142 = mul i32 %arg1, %i1412
+  %i143 = getelementptr i8, ptr %i13, i64 15
+  %i144 = load i8, ptr %i143, align 1
+  %i1453 = zext i8 %i144 to i32
+  %i146 = mul i32 %arg1, %i1453
+  %i147 = getelementptr i8, ptr %i13, i64 14
+  %i148 = load i8, ptr %i147, align 1
+  %i1494 = zext i8 %i148 to i32
+  %i150 = mul i32 %arg1, %i1494
+  %i151 = getelementptr i8, ptr %i13, i64 13
+  %i152 = load i8, ptr %i151, align 1
+  %i1535 = zext i8 %i152 to i32
+  %i154 = mul i32 %arg1, %i1535
+  %i1311 = or i32 %i142, %i146
+  %i1312 = or i32 %i1311, %i150
+  %i1313 = or i32 %i1312, %i154
+  %i1536 = zext i32 %i1313 to i64
+  ret i64 %i1536
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index 0771fab..e0b3ff7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -428,7 +428,7 @@ define i1 @logical_and_icmp_extra_op(<4 x i32> %x, <4 x i32> %y, i1 %c) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP2]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP3]], i1 [[C:%.*]], i1 false
+; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[C:%.*]], i1 [[TMP3]], i1 false
 ; CHECK-NEXT:    ret i1 [[OP_RDX]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
@@ -456,7 +456,7 @@ define i1 @logical_or_icmp_extra_op(<4 x i32> %x, <4 x i32> %y, i1 %c) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP3]], i1 true, i1 [[C:%.*]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[C:%.*]], i1 true, i1 [[TMP3]]
 ; CHECK-NEXT:    ret i1 [[OP_RDX]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
index 360b258..f875d45 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
@@ -14,7 +14,7 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr undef, align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP2]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP0]], float [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP6]])
 ; CHECK-NEXT:    br i1 false, label [[BB2:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb2:
diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll b/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll
index 371b230..afca39a 100644
--- a/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll
@@ -12,7 +12,8 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 poison, i16 poison, i16 poison, i16 0, i16 poison>, i16 [[CALL37]], i32 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[CALL]], i32 5
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 5, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison>, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 3, i32 5>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison>, i16 [[CALL37]], i32 6
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i16> [[TMP5]], i16 [[CALL]], i32 7
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <8 x i16> [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    ret void
 ;
@@ -43,7 +44,8 @@ define void @test1() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 poison, i16 poison, i16 poison, i16 poison, i16 0>, i16 [[CALL]], i32 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[CALL37]], i32 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison>, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 3, i32 4>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison>, i16 [[CALL]], i32 6
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i16> [[TMP5]], i16 [[CALL37]], i32 7
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <8 x i16> [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
index 261ec2b..40568f9 100644
--- a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
@@ -1,31 +1,56 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefix X86 %}
+; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix AARCH64 %}
 
 define i1 @test(float %0, double %1) {
-; CHECK-LABEL: define i1 @test
-; CHECK-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 poison, i32 0, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 2, i32 0, i32 1, i32 7>
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
-; CHECK-NEXT:    [[TMP12:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP10]], i64 0)
-; CHECK-NEXT:    [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP11]], i64 0)
-; CHECK-NEXT:    [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP13]], <2 x double> [[TMP6]], i64 4)
-; CHECK-NEXT:    [[TMP15:%.*]] = fsub <8 x double> [[TMP12]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = fmul <8 x double> [[TMP12]], [[TMP14]]
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <8 x double> [[TMP15]], <8 x double> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP18:%.*]] = fptrunc <8 x double> [[TMP17]] to <8 x float>
-; CHECK-NEXT:    [[TMP19:%.*]] = fmul <8 x float> [[TMP18]], zeroinitializer
-; CHECK-NEXT:    [[TMP20:%.*]] = fcmp oeq <8 x float> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = freeze <8 x i1> [[TMP20]]
-; CHECK-NEXT:    [[TMP22:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP21]])
-; CHECK-NEXT:    ret i1 [[TMP22]]
+; X86-LABEL: define i1 @test
+; X86-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
+; X86-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
+; X86-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
+; X86-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[TMP1]], i32 0
+; X86-NEXT:    [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 poison, i32 0, i32 3, i32 3>
+; X86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; X86-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 2, i32 0, i32 1, i32 7>
+; X86-NEXT:    [[TMP10:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
+; X86-NEXT:    [[TMP12:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP10]], i64 0)
+; X86-NEXT:    [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP11]], i64 0)
+; X86-NEXT:    [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP13]], <2 x double> [[TMP6]], i64 4)
+; X86-NEXT:    [[TMP15:%.*]] = fsub <8 x double> [[TMP12]], [[TMP14]]
+; X86-NEXT:    [[TMP16:%.*]] = fmul <8 x double> [[TMP12]], [[TMP14]]
+; X86-NEXT:    [[TMP17:%.*]] = shufflevector <8 x double> [[TMP15]], <8 x double> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
+; X86-NEXT:    [[TMP18:%.*]] = fptrunc <8 x double> [[TMP17]] to <8 x float>
+; X86-NEXT:    [[TMP19:%.*]] = fmul <8 x float> [[TMP18]], zeroinitializer
+; X86-NEXT:    [[TMP20:%.*]] = fcmp oeq <8 x float> [[TMP19]], zeroinitializer
+; X86-NEXT:    [[TMP21:%.*]] = freeze <8 x i1> [[TMP20]]
+; X86-NEXT:    [[TMP22:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP21]])
+; X86-NEXT:    ret i1 [[TMP22]]
+;
+; AARCH64-LABEL: define i1 @test
+; AARCH64-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
+; AARCH64-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
+; AARCH64-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
+; AARCH64-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[TMP1]], i32 0
+; AARCH64-NEXT:    [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]]
+; AARCH64-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 poison, i32 0, i32 3, i32 3>
+; AARCH64-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 2, i32 0, i32 poison, i32 7>
+; AARCH64-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+; AARCH64-NEXT:    [[TMP11:%.*]] = fmul <4 x double> [[TMP8]], [[TMP10]]
+; AARCH64-NEXT:    [[TMP12:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
+; AARCH64-NEXT:    [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP11]], i64 0)
+; AARCH64-NEXT:    [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP12]], i64 0)
+; AARCH64-NEXT:    [[TMP15:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP14]], <2 x double> [[TMP6]], i64 4)
+; AARCH64-NEXT:    [[TMP16:%.*]] = fsub <8 x double> [[TMP13]], [[TMP15]]
+; AARCH64-NEXT:    [[TMP17:%.*]] = fmul <8 x double> [[TMP13]], [[TMP15]]
+; AARCH64-NEXT:    [[TMP18:%.*]] = shufflevector <8 x double> [[TMP16]], <8 x double> [[TMP17]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
+; AARCH64-NEXT:    [[TMP19:%.*]] = fptrunc <8 x double> [[TMP18]] to <8 x float>
+; AARCH64-NEXT:    [[TMP20:%.*]] = fmul <8 x float> [[TMP19]], zeroinitializer
+; AARCH64-NEXT:    [[TMP21:%.*]] = fcmp oeq <8 x float> [[TMP20]], zeroinitializer
+; AARCH64-NEXT:    [[TMP22:%.*]] = freeze <8 x i1> [[TMP21]]
+; AARCH64-NEXT:    [[TMP23:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP22]])
+; AARCH64-NEXT:    ret i1 [[TMP23]]
 ;
   %3 = fpext float %0 to double
   %4 = fpext float 0.000000e+00 to double
diff --git a/llvm/test/Transforms/SLPVectorizer/full-overlap-non-schedulable.ll b/llvm/test/Transforms/SLPVectorizer/full-overlap-non-schedulable.ll
index dbd9119..c704baaa 100644
--- a/llvm/test/Transforms/SLPVectorizer/full-overlap-non-schedulable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/full-overlap-non-schedulable.ll
@@ -24,7 +24,9 @@ define void @test(ptr %p1, ptr %0, i32 %1, i1 %c1, ptr %p2) {
 ; CHECK:       [[L47]]:
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x ptr> [[TMP5]], i32 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x ptr> [[TMP5]], <4 x ptr> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x ptr> [[TMP5]], <4 x ptr> poison, <2 x i32> <i32 poison, i32 3>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x ptr> [[TMP25]], <2 x ptr> [[TMP26]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq <2 x ptr> [[TMP14]], zeroinitializer
 ; CHECK-NEXT:    [[TMP16:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[TMP17:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> zeroinitializer, <2 x i32> [[TMP16]]
diff --git a/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll b/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll
index a854c61..a42c8f2 100644
--- a/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll
+++ b/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll
@@ -11,8 +11,8 @@ define i32 @test(i8 %0) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <2 x i8> zeroinitializer, [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = load volatile i8, ptr null, align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i8>, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i8> [[TMP5]], <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 poison, i32 poison, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> <i8 0, i8 0, i8 poison, i8 0, i8 0, i8 poison, i8 0, i8 0>, <8 x i32> <i32 8, i32 9, i32 0, i32 11, i32 12, i32 1, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i8> [[TMP5]], <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i8> <i8 0, i8 0, i8 poison, i8 0, i8 0, i8 poison, i8 0, i8 0>, <8 x i8> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 4, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <8 x i8> zeroinitializer, [[TMP7]]
 ; CHECK-NEXT:    [[TEST_STRUCTCOPY_14_S14_CM_COERCE_SROA_2_0_COPYLOAD:%.*]] = load i48, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i48> <i48 poison, i48 0, i48 0, i48 0>, i48 [[TEST_STRUCTCOPY_14_S14_CM_COERCE_SROA_2_0_COPYLOAD]], i32 0
@@ -21,9 +21,9 @@ define i32 @test(i8 %0) {
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(21) null, align 2
 ; CHECK-NEXT:    [[TMP13:%.*]] = load volatile i8, ptr null, align 2
 ; CHECK-NEXT:    [[TMP14:%.*]] = load <2 x i8>, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x i8> [[TMP14]], <2 x i8> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> <i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 0, i8 0, i8 0>, <8 x i32> <i32 8, i32 poison, i32 10, i32 3, i32 4, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <8 x i8> [[TMP16]], i8 [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x i8> <i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 0, i8 0, i8 0>, i8 [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i8> [[TMP14]], <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <8 x i8> <i8 0, i8 poison, i8 0, i8 poison, i8 0, i8 0, i8 0, i8 0>, i8 [[TMP0]], i32 3
 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <8 x i8> [[TMP18]], i8 [[TMP13]], i32 1
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq <8 x i8> [[TMP17]], [[TMP19]]
diff --git a/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll b/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll
new file mode 100644
index 0000000..f0cfd99
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer < %s | FileCheck %s
+
+define i1 @test(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @test(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X]], i32 0
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 -1
+; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[X0]], 0
+; CHECK-NEXT:    [[C1:%.*]] = icmp slt i32 [[X1]], 0
+; CHECK-NEXT:    [[C2:%.*]] = icmp sgt i32 [[X2]], 0
+; CHECK-NEXT:    [[C3:%.*]] = icmp slt i32 [[X3]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = freeze i1 [[C3]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP2]], i1 [[C1]], i1 false
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = select i1 [[TMP1]], i1 [[OP_RDX]], i1 false
+; CHECK-NEXT:    ret i1 [[OP_RDX1]]
+;
+  %x0 = extractelement <4 x i32> %x, i32 0
+  %x1 = extractelement <4 x i32> %x, i32 -1
+  %x2 = extractelement <4 x i32> %x, i32 2
+  %x3 = extractelement <4 x i32> %x, i32 3
+  %2 = icmp ugt i32 %x0, 0
+  %c1 = icmp slt i32 %x1, 0
+  %c2 = icmp sgt i32 %x2, 0
+  %c3 = icmp slt i32 %x3, 0
+  %s1 = select i1 %2, i1 %c1, i1 false
+  %s2 = select i1 %s1, i1 %c3, i1 false
+  %s3 = select i1 %s2, i1 %c3, i1 false
+  ret i1 %s3
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll b/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll
index 561182d..940ee5b 100644
--- a/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll
@@ -1,30 +1,54 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=x86_64 -slp-threshold=-150 | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=aarch64-unknown-linux-gnu -slp-threshold=-150 | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=x86_64 -slp-threshold=-150 | FileCheck %s --check-prefix X86 %}
+; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=aarch64-unknown-linux-gnu -slp-threshold=-150 | FileCheck %s --check-prefix AARCH64 %}
 
 define i1 @test(ptr %arg, ptr %i233, i64 %i241, ptr %i235, ptr %i237, ptr %i227) {
-; CHECK-LABEL: @test(
-; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[I226:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i32 7
-; CHECK-NEXT:    [[I242:%.*]] = getelementptr double, ptr [[I233:%.*]], i64 [[I241:%.*]]
-; CHECK-NEXT:    [[I245:%.*]] = getelementptr double, ptr [[I235:%.*]], i64 [[I241]]
-; CHECK-NEXT:    [[I248:%.*]] = getelementptr double, ptr [[I237:%.*]], i64 [[I241]]
-; CHECK-NEXT:    [[I250:%.*]] = getelementptr double, ptr [[I227:%.*]], i64 [[I241]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x ptr>, ptr [[I226]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, ptr [[I242]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, <8 x i32> <i32 1, i32 9, i32 0, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = or <8 x i1> [[TMP4]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP11]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = and i1 [[TMP12]], false
-; CHECK-NEXT:    ret i1 [[OP_RDX]]
+; X86-LABEL: @test(
+; X86-NEXT:  bb:
+; X86-NEXT:    [[I226:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i32 7
+; X86-NEXT:    [[I242:%.*]] = getelementptr double, ptr [[I233:%.*]], i64 [[I241:%.*]]
+; X86-NEXT:    [[I245:%.*]] = getelementptr double, ptr [[I235:%.*]], i64 [[I241]]
+; X86-NEXT:    [[I248:%.*]] = getelementptr double, ptr [[I237:%.*]], i64 [[I241]]
+; X86-NEXT:    [[I250:%.*]] = getelementptr double, ptr [[I227:%.*]], i64 [[I241]]
+; X86-NEXT:    [[TMP0:%.*]] = load <4 x ptr>, ptr [[I226]], align 8
+; X86-NEXT:    [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; X86-NEXT:    [[TMP2:%.*]] = insertelement <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, ptr [[I242]], i32 0
+; X86-NEXT:    [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2
+; X86-NEXT:    [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]]
+; X86-NEXT:    [[TMP5:%.*]] = insertelement <8 x ptr> poison, ptr [[I250]], i32 0
+; X86-NEXT:    [[TMP6:%.*]] = insertelement <8 x ptr> [[TMP5]], ptr [[I242]], i32 1
+; X86-NEXT:    [[TMP7:%.*]] = insertelement <8 x ptr> [[TMP6]], ptr [[I245]], i32 2
+; X86-NEXT:    [[TMP8:%.*]] = insertelement <8 x ptr> [[TMP7]], ptr [[I248]], i32 3
+; X86-NEXT:    [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP8]], <8 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; X86-NEXT:    [[TMP10:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, <8 x i32> <i32 1, i32 9, i32 0, i32 11, i32 12, i32 13, i32 14, i32 15>
+; X86-NEXT:    [[TMP11:%.*]] = icmp ult <8 x ptr> [[TMP9]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = or <8 x i1> [[TMP4]], [[TMP11]]
+; X86-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP12]])
+; X86-NEXT:    [[OP_RDX:%.*]] = and i1 [[TMP13]], false
+; X86-NEXT:    ret i1 [[OP_RDX]]
+;
+; AARCH64-LABEL: @test(
+; AARCH64-NEXT:  bb:
+; AARCH64-NEXT:    [[I226:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i32 7
+; AARCH64-NEXT:    [[I242:%.*]] = getelementptr double, ptr [[I233:%.*]], i64 [[I241:%.*]]
+; AARCH64-NEXT:    [[I245:%.*]] = getelementptr double, ptr [[I235:%.*]], i64 [[I241]]
+; AARCH64-NEXT:    [[I248:%.*]] = getelementptr double, ptr [[I237:%.*]], i64 [[I241]]
+; AARCH64-NEXT:    [[I250:%.*]] = getelementptr double, ptr [[I227:%.*]], i64 [[I241]]
+; AARCH64-NEXT:    [[TMP0:%.*]] = load <4 x ptr>, ptr [[I226]], align 8
+; AARCH64-NEXT:    [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; AARCH64-NEXT:    [[TMP2:%.*]] = insertelement <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, ptr [[I242]], i32 0
+; AARCH64-NEXT:    [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2
+; AARCH64-NEXT:    [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]]
+; AARCH64-NEXT:    [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
+; AARCH64-NEXT:    [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2
+; AARCH64-NEXT:    [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3
+; AARCH64-NEXT:    [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, <8 x i32> <i32 1, i32 9, i32 0, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AARCH64-NEXT:    [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]]
+; AARCH64-NEXT:    [[TMP11:%.*]] = or <8 x i1> [[TMP4]], [[TMP10]]
+; AARCH64-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP11]])
+; AARCH64-NEXT:    [[OP_RDX:%.*]] = and i1 [[TMP12]], false
+; AARCH64-NEXT:    ret i1 [[OP_RDX]]
 ;
 bb:
   %i226 = getelementptr ptr, ptr %arg, i32 7
diff --git a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll
index 61a84a6..056b622 100644
--- a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll
+++ b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll
@@ -13,9 +13,9 @@ define void @func(i32 %0) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP6]] to i64
 ; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP9]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <32 x i32> [[TMP11]], <32 x i32> <i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, <32 x i32> <i32 poison, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <32 x i32> [[TMP12]], i32 0, i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[TMP9]] to i32
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, i32 [[TMP11]], i32 30
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i32> [[TMP12]], <32 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 30, i32 30>
 ; CHECK-NEXT:    [[TMP14:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> [[TMP13]], <8 x i32> zeroinitializer, i64 16)
 ; CHECK-NEXT:    [[TMP15:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v4i32(<32 x i32> [[TMP14]], <4 x i32> zeroinitializer, i64 24)
 ; CHECK-NEXT:    [[TMP16:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v2i32(<32 x i32> [[TMP15]], <2 x i32> zeroinitializer, i64 14)
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
index 7f484e2..ffbacc1 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
@@ -34,11 +34,14 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: @switch.table.unreachable_case = private unnamed_addr constant [9 x i32] [i32 0, i32 0, i32 0, i32 2, i32 -1, i32 1, i32 1, i32 1, i32 1], align 4
 ; CHECK: @switch.table.unreachable_default = private unnamed_addr constant [4 x i32] [i32 42, i32 52, i32 1, i32 2], align 4
 ; CHECK: @switch.table.nodefaultnoholes = private unnamed_addr constant [4 x i32] [i32 55, i32 123, i32 0, i32 -1], align 4
-; CHECK: @switch.table.nodefaultwithholes = private unnamed_addr constant [6 x i32] [i32 55, i32 123, i32 0, i32 -1, i32 55, i32 -1], align 4
+; CHECK: @switch.table.nodefaultwithholes = private unnamed_addr constant [6 x i32] [i32 55, i32 123, i32 0, i32 -1, i32 poison, i32 -1], align 4
 ; CHECK: @switch.table.threecases = private unnamed_addr constant [3 x i32] [i32 10, i32 7, i32 5], align 4
-; CHECK: @switch.table.covered_switch_with_bit_tests = private unnamed_addr constant [8 x i32] [i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 1, i32 1], align 4
+; CHECK: @switch.table.covered_switch_with_bit_tests = private unnamed_addr constant [8 x i32] [i32 2, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 1], align 4
 ; CHECK: @switch.table.signed_overflow1 = private unnamed_addr constant [4 x i32] [i32 3333, i32 4444, i32 1111, i32 2222], align 4
-; CHECK: @switch.table.signed_overflow2 = private unnamed_addr constant [4 x i32] [i32 3333, i32 4444, i32 2222, i32 2222], align 4
+; CHECK: @switch.table.signed_overflow2 = private unnamed_addr constant [4 x i32] [i32 3333, i32 4444, i32 poison, i32 2222], align 4
+; CHECK: @switch.table.constant_hole_unreachable_default_firstundef = private unnamed_addr constant [5 x i32] [i32 undef, i32 poison, i32 1, i32 1, i32 1], align 4
+; CHECK: @switch.table.constant_hole_unreachable_default_lastundef = private unnamed_addr constant [5 x i32] [i32 1, i32 poison, i32 1, i32 1, i32 undef], align 4
+; CHECK: @switch.table.linearmap_hole_unreachable_default = private unnamed_addr constant [5 x i32] [i32 1, i32 poison, i32 5, i32 7, i32 9], align 4
 ;.
 define i32 @f(i32 %c) {
 ; CHECK-LABEL: @f(
@@ -2184,3 +2187,226 @@ return:                                           ; preds = %sw.default, %entry,
   %retval.0 = phi { i8, i8 } [ undef, %entry ], [ undef, %entry ], [ undef, %entry ], [ %1, %sw.default ]
   ret { i8, i8 } %retval.0
 }
+
+; The switch has a hole which falls through to an unreachable default case, but it can still be optimized into a constant load because
+; the poison value used for the hole is ignored.
+define i32 @constant_hole_unreachable_default(i32 %x) {
+; CHECK-LABEL: @constant_hole_unreachable_default(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i32 1
+;
+entry:
+  switch i32 %x, label %sw.default [
+  i32 0, label %bb0
+  i32 2, label %bb0
+  i32 3, label %bb0
+  i32 4, label %bb0
+  ]
+
+sw.default: unreachable
+bb0: br label %return
+
+return:
+  %res = phi i32 [ 1, %bb0 ]
+  ret i32 %res
+}
+
+; The switch has a hole which falls through to an unreachable default case and the first case explicitly returns undef, yet it cannot be optimized into a simple
+; constant because we actually treat undef as a unique value rather than ignoring it.
+define i32 @constant_hole_unreachable_default_firstundef(i32 %x) {
+; CHECK-LABEL: @constant_hole_unreachable_default_firstundef(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [5 x i32], ptr @switch.table.constant_hole_unreachable_default_firstundef, i32 0, i32 [[X:%.*]]
+; CHECK-NEXT:    [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
+; CHECK-NEXT:    ret i32 [[SWITCH_LOAD]]
+;
+entry:
+  switch i32 %x, label %sw.default [
+  i32 0, label %bb.undef
+  i32 2, label %bb0
+  i32 3, label %bb0
+  i32 4, label %bb0
+  ]
+
+sw.default: unreachable
+bb.undef: br label %return
+bb0: br label %return
+
+return:
+  %res = phi i32 [ undef, %bb.undef ], [ 1, %bb0 ]
+  ret i32 %res
+}
+
+; The switch has a hole which falls through to an unreachable default case and the last case explicitly returns undef, yet it cannot be optimized into a simple
+; constant because we actually treat undef as a unique value rather than ignoring it.
+define i32 @constant_hole_unreachable_default_lastundef(i32 %x) {
+; CHECK-LABEL: @constant_hole_unreachable_default_lastundef(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [5 x i32], ptr @switch.table.constant_hole_unreachable_default_lastundef, i32 0, i32 [[X:%.*]]
+; CHECK-NEXT:    [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
+; CHECK-NEXT:    ret i32 [[SWITCH_LOAD]]
+;
+entry:
+  switch i32 %x, label %sw.default [
+  i32 0, label %bb0
+  i32 2, label %bb0
+  i32 3, label %bb0
+  i32 4, label %bb.undef
+  ]
+
+sw.default: unreachable
+bb.undef: br label %return
+bb0: br label %return
+
+return:
+  %res = phi i32 [ undef, %bb.undef ], [ 1, %bb0 ]
+  ret i32 %res
+}
+
+; The switch has a hole which falls through to an unreachable default case and the first case explicitly returns poison, but it can still
+; be optimized into a constant load because the poison values are ignored.
+define i32 @constant_hole_unreachable_default_firstpoison(i32 %x) {
+; CHECK-LABEL: @constant_hole_unreachable_default_firstpoison(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i32 1
+;
+entry:
+  switch i32 %x, label %sw.default [
+  i32 0, label %bb.poison
+  i32 2, label %bb0
+  i32 3, label %bb0
+  i32 4, label %bb0
+  ]
+
+sw.default: unreachable
+bb.poison: br label %return
+bb0: br label %return
+
+return:
+  %res = phi i32 [ poison, %bb.poison ], [ 1, %bb0 ]
+  ret i32 %res
+}
+
+; The switch has a hole which falls through to an unreachable default case and the first case explicitly returns poison, but it can still
+; be optimized into a constant load because the poison values are ignored.
+define i32 @constant_hole_unreachable_default_lastpoison(i32 %x) {
+; CHECK-LABEL: @constant_hole_unreachable_default_lastpoison(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i32 1
+;
+entry:
+  switch i32 %x, label %sw.default [
+  i32 0, label %bb0
+  i32 2, label %bb0
+  i32 3, label %bb0
+  i32 4, label %bb.poison
+  ]
+
+sw.default: unreachable
+bb.poison: br label %return
+bb0: br label %return
+
+return:
+  %res = phi i32 [ poison, %bb.poison ], [ 1, %bb0 ]
+  ret i32 %res
+}
+
+define i32 @constant_hole_unreachable_default_undef_poison(i32 %x) {
+; CHECK-LABEL: @constant_hole_unreachable_default_undef_poison(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  switch i32 %x, label %sw.default [
+  i32 0, label %bb.undef
+  i32 2, label %bb.poison
+  i32 3, label %bb.poison
+  i32 4, label %bb.poison
+  ]
+
+sw.default: unreachable
+bb.undef: br label %return
+bb.poison: br label %return
+
+return:
+  %res = phi i32 [ undef, %bb.undef ], [ poison, %bb.poison ]
+  ret i32 %res
+}
+
+define i32 @constant_hole_unreachable_default_poison_undef(i32 %x) {
+; CHECK-LABEL: @constant_hole_unreachable_default_poison_undef(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  switch i32 %x, label %sw.default [
+  i32 0, label %bb.poison
+  i32 2, label %bb.poison
+  i32 3, label %bb.poison
+  i32 4, label %bb.undef
+  ]
+
+sw.default: unreachable
+bb.undef: br label %return
+bb.poison: br label %return
+
+return:
+  %res = phi i32 [ undef, %bb.undef ], [ poison, %bb.poison ]
+  ret i32 %res
+}
+
+; The switch has a hole which falls through to an unreachable default case, which prevents it from being optimized into a linear mapping 2*x+1.
+; TODO: We should add support for this, at least in certain cases.
+define i32 @linearmap_hole_unreachable_default(i32 %x) {
+; CHECK-LABEL: @linearmap_hole_unreachable_default(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [5 x i32], ptr @switch.table.linearmap_hole_unreachable_default, i32 0, i32 [[X:%.*]]
+; CHECK-NEXT:    [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
+; CHECK-NEXT:    ret i32 [[SWITCH_LOAD]]
+;
+entry:
+  switch i32 %x, label %sw.default [
+  i32 0, label %bb0
+  i32 2, label %bb2
+  i32 3, label %bb3
+  i32 4, label %bb4
+  ]
+
+sw.default: unreachable
+bb0: br label %return
+bb2: br label %return
+bb3: br label %return
+bb4: br label %return
+
+return:
+  %res = phi i32 [ 1, %bb0 ], [ 5, %bb2 ], [ 7, %bb3 ], [ 9, %bb4 ]
+  ret i32 %res
+}
+
+; The switch has a hole which falls through to an unreachable default case, but it can still be optimized into a bitmask extraction because
+; the poison value used for the hole is simply replaced with zero.
+define i1 @bitset_hole_unreachable_default(i32 %x) {
+; CHECK-LABEL: @bitset_hole_unreachable_default(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SWITCH_CAST:%.*]] = trunc i32 [[X:%.*]] to i5
+; CHECK-NEXT:    [[SWITCH_SHIFTAMT:%.*]] = mul nuw nsw i5 [[SWITCH_CAST]], 1
+; CHECK-NEXT:    [[SWITCH_DOWNSHIFT:%.*]] = lshr i5 8, [[SWITCH_SHIFTAMT]]
+; CHECK-NEXT:    [[SWITCH_MASKED:%.*]] = trunc i5 [[SWITCH_DOWNSHIFT]] to i1
+; CHECK-NEXT:    ret i1 [[SWITCH_MASKED]]
+;
+entry:
+  switch i32 %x, label %sw.default [
+  i32 0, label %bb0
+  i32 2, label %bb0
+  i32 3, label %bb1
+  i32 4, label %bb0
+  ]
+
+sw.default: unreachable
+bb0: br label %return
+bb1: br label %return
+
+return:
+  %res = phi i1 [ 0, %bb0 ], [ 1, %bb1 ]
+  ret i1 %res
+}
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table_big.ll b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table_big.ll
index 7988e30..4ebf09a 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table_big.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table_big.ll
@@ -7,11 +7,11 @@ target triple = "i386-pc-linux-gnu"
 ;.
 ; CHECK: @switch.table.reachable_default_dense_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1], align 4
 ; CHECK: @switch.table.unreachable_default_dense_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1], align 4
-; CHECK: @switch.table.reachable_default_holes_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 0, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 0, i32 2, i32 1, i32 0, i32 7, i32 0, i32 5, i32 4, i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 5, i32 0, i32 3, i32 2, i32 1], align 4
-; CHECK: @switch.table.unreachable_default_holes_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 0, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 0, i32 2, i32 1, i32 0, i32 7, i32 0, i32 5, i32 4, i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 5, i32 0, i32 3, i32 2, i32 1], align 4
+; CHECK: @switch.table.reachable_default_holes_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 poison, i32 4, i32 3, i32 2, i32 1, i32 poison, i32 7, i32 6, i32 5, i32 4, i32 poison, i32 2, i32 1, i32 0, i32 7, i32 poison, i32 5, i32 4, i32 3, i32 2, i32 poison, i32 0, i32 7, i32 6, i32 5, i32 poison, i32 3, i32 2, i32 1], align 4
+; CHECK: @switch.table.unreachable_default_holes_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 poison, i32 4, i32 3, i32 2, i32 1, i32 poison, i32 7, i32 6, i32 5, i32 4, i32 poison, i32 2, i32 1, i32 0, i32 7, i32 poison, i32 5, i32 4, i32 3, i32 2, i32 poison, i32 0, i32 7, i32 6, i32 5, i32 poison, i32 3, i32 2, i32 1], align 4
 ; CHECK: @switch.table.reachable_default_dense_0to32 = private unnamed_addr constant [33 x i32] [i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0], align 4
 ; CHECK: @switch.table.unreachable_default_dense_0to32 = private unnamed_addr constant [33 x i32] [i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0], align 4
-; CHECK: @switch.table.unreachable_default_holes_0to32 = private unnamed_addr constant [33 x i32] [i32 0, i32 7, i32 6, i32 0, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 0, i32 2, i32 1, i32 0, i32 7, i32 0, i32 5, i32 4, i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 5, i32 0, i32 3, i32 2, i32 1, i32 0], align 4
+; CHECK: @switch.table.unreachable_default_holes_0to32 = private unnamed_addr constant [33 x i32] [i32 0, i32 7, i32 6, i32 poison, i32 4, i32 3, i32 2, i32 1, i32 poison, i32 7, i32 6, i32 5, i32 4, i32 poison, i32 2, i32 1, i32 0, i32 7, i32 poison, i32 5, i32 4, i32 3, i32 2, i32 poison, i32 0, i32 7, i32 6, i32 5, i32 poison, i32 3, i32 2, i32 1, i32 0], align 4
 ;.
 define i32 @reachable_default_dense_0to31(i32 %x, i32 %y) {
 ; CHECK-LABEL: @reachable_default_dense_0to31(
diff --git a/llvm/test/Transforms/SimplifyCFG/merge-calls-memprof.ll b/llvm/test/Transforms/SimplifyCFG/merge-calls-memprof.ll
new file mode 100644
index 0000000..10c6aeb
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/merge-calls-memprof.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+
+;; Test to ensure that memprof related metadata is not dropped when
+;; instructions are combined. Currently the metadata from the first instruction
+;; is kept, which prevents full loss of profile context information.
+
+; RUN: opt < %s -passes=simplifycfg -S | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define dso_local noundef nonnull ptr @_Z4testb(i1 noundef zeroext %b) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local noundef nonnull ptr @_Z4testb(
+; CHECK-SAME: i1 noundef zeroext [[B:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CALL:%.*]] = call noalias noundef nonnull dereferenceable(4) ptr @_Znwm(i64 noundef 4), !memprof [[META0:![0-9]+]], !callsite [[META3:![0-9]+]]
+; CHECK-NEXT:    ret ptr [[CALL]]
+;
+entry:
+  br i1 %b, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %call = call noalias noundef nonnull dereferenceable(4) ptr @_Znwm(i64 noundef 4), !memprof !0, !callsite !3
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %call1 = call noalias noundef nonnull dereferenceable(4) ptr @_Znwm(i64 noundef 4), !memprof !4, !callsite !7
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %x.0 = phi ptr [ %call, %if.then ], [ %call1, %if.else ]
+  ret ptr %x.0
+}
+
+
+declare ptr @_Znwm(i64) nounwind readonly
+
+!0 = !{!1}
+!1 = !{!2, !"notcold"}
+!2 = !{i64 -852997907418798798, i64 -2101080423462424381, i64 5188446645037944434}
+!3 = !{i64 -852997907418798798}
+!4 = !{!5}
+!5 = !{!6, !"cold"}
+!6 = !{i64 123, i64 -2101080423462424381, i64 5188446645037944434}
+!7 = !{i64 123}
+;.
+; CHECK: [[META0]] = !{[[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{[[META2:![0-9]+]], !"notcold"}
+; CHECK: [[META2]] = !{i64 -852997907418798798, i64 -2101080423462424381, i64 5188446645037944434}
+; CHECK: [[META3]] = !{i64 -852997907418798798}
+;.
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll
index 92766d5..420e844 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll
@@ -11,7 +11,7 @@ target triple = "nvptx64-nvidia-cuda"
 ;       use((b + i) * s);
 ;   }
 ; }
-define void @foo(i32 %b, i32 %s) {
+define ptx_kernel void @foo(i32 %b, i32 %s) {
 ; CHECK-LABEL: .visible .entry foo(
 entry:
 ; CHECK: ld.param.u32 [[s:%r[0-9]+]], [foo_param_1];
@@ -65,7 +65,3 @@ for.inc.3:                                        ; preds = %if.then.3, %for.inc
 declare zeroext i1 @cond(i32)
 
 declare void @use(i32)
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @foo, !"kernel", i32 1}
diff --git a/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll b/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll
index 057d9af..c3639ba 100644
--- a/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll
@@ -80,13 +80,29 @@ define i64 @movmsk_i64_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 }
 
 define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
-; CHECK-LABEL: @movmsk_i64_v64i8_v16i8(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
-; CHECK-NEXT:    ret i64 [[OR]]
+; SSE-LABEL: @movmsk_i64_v64i8_v16i8(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE-NEXT:    [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
+; SSE-NEXT:    [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
+; SSE-NEXT:    ret i64 [[OR]]
+;
+; AVX2-LABEL: @movmsk_i64_v64i8_v16i8(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT:    [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
+; AVX2-NEXT:    [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
+; AVX2-NEXT:    ret i64 [[OR]]
+;
+; AVX512-LABEL: @movmsk_i64_v64i8_v16i8(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512-NEXT:    [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
+; AVX512-NEXT:    [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
+; AVX512-NEXT:    ret i64 [[OR]]
 ;
   %c0 = icmp slt <16 x i8> %v0, zeroinitializer
   %c1 = icmp slt <16 x i8> %v1, zeroinitializer
@@ -110,14 +126,32 @@ define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2,
 }
 
 define i64 @movmsk_i64_v32i32_v4i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
-; CHECK-LABEL: @movmsk_i64_v32i32_v4i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
-; CHECK-NEXT:    [[OR:%.*]] = zext i16 [[TMP5]] to i64
-; CHECK-NEXT:    ret i64 [[OR]]
+; SSE-LABEL: @movmsk_i64_v32i32_v4i32(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
+; SSE-NEXT:    [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
+; SSE-NEXT:    [[OR:%.*]] = zext i16 [[TMP5]] to i64
+; SSE-NEXT:    ret i64 [[OR]]
+;
+; AVX2-LABEL: @movmsk_i64_v32i32_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:    [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
+; AVX2-NEXT:    [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
+; AVX2-NEXT:    [[OR:%.*]] = zext i16 [[TMP5]] to i64
+; AVX2-NEXT:    ret i64 [[OR]]
+;
+; AVX512-LABEL: @movmsk_i64_v32i32_v4i32(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
+; AVX512-NEXT:    [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
+; AVX512-NEXT:    [[OR:%.*]] = zext i16 [[TMP5]] to i64
+; AVX512-NEXT:    ret i64 [[OR]]
 ;
   %c0 = icmp slt <4 x i32> %v0, zeroinitializer
   %c1 = icmp slt <4 x i32> %v1, zeroinitializer
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
index 800f576..f3b7f7b 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
@@ -465,33 +465,13 @@ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
-; SSE-LABEL: @PR34724(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; SSE-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; SSE-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; SSE-NEXT:    [[V2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 poison, i32 2, i32 4, i32 poison>
-; SSE-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; SSE-NEXT:    ret <4 x float> [[V3]]
-;
-; AVX-LABEL: @PR34724(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; AVX-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; AVX-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; AVX-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; AVX-NEXT:    [[V1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 poison, i32 2, i32 poison, i32 poison>
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
-; AVX-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; AVX-NEXT:    ret <4 x float> [[V3]]
+; CHECK-LABEL: @PR34724(
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[V3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[V3]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
index 307fbf7..c125b73 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -465,34 +465,19 @@ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
-; SSE-LABEL: @PR34724(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; SSE-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; SSE-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; SSE-NEXT:    [[V1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 6, i32 7>
-; SSE-NEXT:    [[V2:%.*]] = shufflevector <4 x float> [[V1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
-; SSE-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; SSE-NEXT:    ret <4 x float> [[V3]]
-;
-; AVX-LABEL: @PR34724(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; AVX-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; AVX-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; AVX-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; AVX-NEXT:    [[V1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 6, i32 7>
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
-; AVX-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; AVX-NEXT:    ret <4 x float> [[V3]]
+; CHECK-LABEL: @PR34724(
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
+; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
+; CHECK-NEXT:    [[V1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 6, i32 7>
+; CHECK-NEXT:    [[V2:%.*]] = shufflevector <4 x float> [[V1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[V3]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
index 5c856ce..cd2bc75 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
@@ -91,20 +91,11 @@ define <4 x double> @ext1_v2f64v4f64(<2 x double> %x, <4 x double> %y) {
   ret <4 x double> %r
 }
 
-; The vector fneg would cost twice as much as the scalar op with SSE,
-; so we don't transform there (the shuffle would also be more expensive).
-
 define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
-; SSE-LABEL: @ext7_v8f32(
-; SSE-NEXT:    [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
-; SSE-NEXT:    [[N:%.*]] = fneg float [[E]]
-; SSE-NEXT:    [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
-; SSE-NEXT:    ret <8 x float> [[R]]
-;
-; AVX-LABEL: @ext7_v8f32(
-; AVX-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
-; AVX-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
-; AVX-NEXT:    ret <8 x float> [[R]]
+; CHECK-LABEL: @ext7_v8f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEXT:    ret <8 x float> [[R]]
 ;
   %e = extractelement <8 x float> %x, i32 7
   %n = fneg float %e
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll b/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll
new file mode 100644
index 0000000..0acfecc
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s
+
+; infinite loop if we add the erased instructions to the work list in the wrong order.
+define void @multiple_extract(ptr %p) {
+; CHECK-LABEL: @multiple_extract(
+; CHECK-NEXT:    [[VP:%.*]] = load ptr, ptr [[P:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i32>, ptr [[VP]], i32 0, i64 0
+; CHECK-NEXT:    [[E0:%.*]] = load i32, ptr [[TMP1]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr [[VP]], i32 0, i64 1
+; CHECK-NEXT:    [[E1:%.*]] = load i32, ptr [[TMP2]], align 4
+; CHECK-NEXT:    store i32 [[E0]], ptr [[P]], align 4
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4
+; CHECK-NEXT:    store i32 [[E1]], ptr [[P1]], align 4
+; CHECK-NEXT:    ret void
+;
+  %vp = load ptr, ptr %p, align 8
+  %v = load <2 x i32>, ptr %vp, align 16
+  %e0 = extractelement <2 x i32> %v, i64 0
+  %e1 = extractelement <2 x i32> %v, i64 1
+  store i32 %e0, ptr %p, align 4
+  %p1 = getelementptr inbounds nuw i8, ptr %p, i64 4
+  store i32 %e1, ptr %p1, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 937d404..2db1e21 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -544,10 +544,7 @@ define void @PR47558_multiple_use_load(ptr nocapture nonnull %resultptr, ptr noc
 ; CHECK-NEXT:    [[T1:%.*]] = insertelement <2 x float> poison, float [[SCALE]], i32 0
 ; CHECK-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
 ; CHECK-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
-; CHECK-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
-; CHECK-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> poison, float [[T4]], i32 0
-; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x float> [[RESULT0]], <2 x float> [[T3]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8
+; CHECK-NEXT:    store <2 x float> [[T3]], ptr [[RESULTPTR:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %scaleptr = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
index b3360b6..f9108ef 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512
 
 declare void @use(<4 x i1>)
 
@@ -105,8 +105,8 @@ define <4 x i32> @shuf_icmp_ugt_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z,
 define <4 x i32> @shuf_fcmp_oeq_v4i32(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
 ; SSE2-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
 ; SSE2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 2, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 poison, i32 poison, i32 4, i32 0>
 ; SSE2-NEXT:    [[S:%.*]] = fcmp oeq <4 x float> [[TMP1]], [[TMP2]]
 ; SSE2-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
 ; SSE2-NEXT:    ret <4 x i32> [[R]]
@@ -115,21 +115,29 @@ define <4 x i32> @shuf_fcmp_oeq_v4i32(<4 x float> %x, <4 x float> %y, <4 x float
 ; SSE4-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
 ; SSE4-NEXT:    [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
 ; SSE4-NEXT:    [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
-; SSE4-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+; SSE4-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 4, i32 0>
 ; SSE4-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
 ; SSE4-NEXT:    ret <4 x i32> [[R]]
 ;
-; AVX-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
-; AVX-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; AVX-NEXT:    [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
-; AVX-NEXT:    [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
-; AVX-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
-; AVX-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; AVX-NEXT:    ret <4 x i32> [[R]]
+; AVX2-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
+; AVX2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT:    [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
+; AVX2-NEXT:    [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
+; AVX2-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 4, i32 0>
+; AVX2-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX512-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
+; AVX512-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 poison, i32 poison, i32 4, i32 0>
+; AVX512-NEXT:    [[S:%.*]] = fcmp oeq <4 x float> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX512-NEXT:    ret <4 x i32> [[R]]
 ;
   %b0 = fcmp oeq <4 x float> %x, %y
   %b1 = fcmp oeq <4 x float> %x, %z
-  %s = shufflevector <4 x i1> %b0, <4 x i1> %b1, <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+  %s = shufflevector <4 x i1> %b0, <4 x i1> %b1, <4 x i32> <i32 poison, i32 poison, i32 4, i32 0>
   %r = sext <4 x i1> %s to <4 x i32>
   ret <4 x i32> %r
 }
@@ -137,29 +145,13 @@ define <4 x i32> @shuf_fcmp_oeq_v4i32(<4 x float> %x, <4 x float> %y, <4 x float
 ; For commutative instructions, common operand may be swapped
 
 define <4 x i32> @shuf_fcmp_one_v4f32_swap(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
-; SSE-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
-; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
-; SSE-NEXT:    [[S:%.*]] = fcmp one <4 x float> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; SSE-NEXT:    ret <4 x i32> [[R]]
-;
-; AVX2-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
-; AVX2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; AVX2-NEXT:    [[B0:%.*]] = fcmp one <4 x float> [[X]], [[Y]]
-; AVX2-NEXT:    [[B1:%.*]] = fcmp one <4 x float> [[Z]], [[X]]
-; AVX2-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX2-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; AVX2-NEXT:    ret <4 x i32> [[R]]
-;
-; AVX512-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
-; AVX512-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
-; AVX512-NEXT:    [[S:%.*]] = fcmp one <4 x float> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; AVX512-NEXT:    ret <4 x i32> [[R]]
+; CHECK-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
+; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
+; CHECK-NEXT:    [[S:%.*]] = fcmp one <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
   %b0 = fcmp one <4 x float> %x, %y
   %b1 = fcmp one <4 x float> %z, %x
@@ -275,3 +267,33 @@ define <4 x i32> @shuf_icmp_ugt_v4i32_use(<4 x i32> %x, <4 x i32> %y, <4 x i32>
   %r = sext <4 x i1> %s to <4 x i32>
   ret <4 x i32> %r
 }
+
+; PR121110 - don't merge equivalent (but not matching) predicates
+
+define <2 x i1> @PR121110() {
+; CHECK-LABEL: define <2 x i1> @PR121110(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[UGT:%.*]] = icmp samesign ugt <2 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[SGT:%.*]] = icmp sgt <2 x i32> zeroinitializer, <i32 6, i32 -4>
+; CHECK-NEXT:    [[RES:%.*]] = shufflevector <2 x i1> [[UGT]], <2 x i1> [[SGT]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    ret <2 x i1> [[RES]]
+;
+  %ugt = icmp samesign ugt <2 x i32> < i32 0, i32 0 >, < i32 0, i32 0 >
+  %sgt = icmp sgt <2 x i32> < i32 0, i32 0 >, < i32 6, i32 4294967292 >
+  %res = shufflevector <2 x i1> %ugt, <2 x i1> %sgt, <2 x i32> <i32 0, i32 3>
+  ret <2 x i1> %res
+}
+
+define <2 x i1> @PR121110_commute() {
+; CHECK-LABEL: define <2 x i1> @PR121110_commute(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[SGT:%.*]] = icmp sgt <2 x i32> zeroinitializer, <i32 6, i32 -4>
+; CHECK-NEXT:    [[UGT:%.*]] = icmp samesign ugt <2 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = shufflevector <2 x i1> [[SGT]], <2 x i1> [[UGT]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    ret <2 x i1> [[RES]]
+;
+  %sgt = icmp sgt <2 x i32> < i32 0, i32 0 >, < i32 6, i32 4294967292 >
+  %ugt = icmp samesign ugt <2 x i32> < i32 0, i32 0 >, < i32 0, i32 0 >
+  %res = shufflevector <2 x i1> %sgt, <2 x i1> %ugt, <2 x i32> <i32 0, i32 3>
+  ret <2 x i1> %res
+}
diff --git a/llvm/test/tools/UpdateTestChecks/lit.local.cfg b/llvm/test/tools/UpdateTestChecks/lit.local.cfg
index a954eb7..7147769 100644
--- a/llvm/test/tools/UpdateTestChecks/lit.local.cfg
+++ b/llvm/test/tools/UpdateTestChecks/lit.local.cfg
@@ -10,7 +10,7 @@ except ImportError:
     from pipes import quote as shell_quote
 
 
-def add_update_script_substition(
+def add_update_script_substitution(
     name, python_exe=config.python_executable, extra_args=""
 ):
     assert name.startswith("%")
@@ -33,26 +33,26 @@ llc_path = os.path.join(config.llvm_tools_dir, "llc")
 if os.path.isfile(llc_path):
     config.available_features.add("llc-binary")
     llc_arg = "--llc-binary " + shell_quote(llc_path)
-    add_update_script_substition("%update_llc_test_checks", extra_args=llc_arg)
-    add_update_script_substition("%update_mir_test_checks", extra_args=llc_arg)
+    add_update_script_substitution("%update_llc_test_checks", extra_args=llc_arg)
+    add_update_script_substitution("%update_mir_test_checks", extra_args=llc_arg)
 
 opt_path = os.path.join(config.llvm_tools_dir, "opt")
 if os.path.isfile(opt_path):
     config.available_features.add("opt-binary")
     opt_arg = "--opt-binary " + shell_quote(opt_path)
-    add_update_script_substition("%update_test_checks", extra_args=opt_arg)
-    add_update_script_substition("%update_analyze_test_checks", extra_args=opt_arg)
+    add_update_script_substitution("%update_test_checks", extra_args=opt_arg)
+    add_update_script_substitution("%update_analyze_test_checks", extra_args=opt_arg)
 
 llvm_mca_path = os.path.join(config.llvm_tools_dir, "llvm-mca")
 if os.path.isfile(llvm_mca_path):
     config.available_features.add("llvm-mca-binary")
     mca_arg = "--llvm-mca-binary " + shell_quote(llvm_mca_path)
-    add_update_script_substition("%update_test_checks", extra_args=mca_arg)
+    add_update_script_substitution("%update_test_checks", extra_args=mca_arg)
 
 split_file_path = os.path.join(config.llvm_tools_dir, "split-file")
 if os.path.isfile(split_file_path):
-    add_update_script_substition("%update_test_body")
+    add_update_script_substitution("%update_test_body")
 
 llvm_mc_path = os.path.join(config.llvm_tools_dir, "llvm-mc")
 if os.path.isfile(llvm_mc_path):
-    add_update_script_substition("%update_mc_test_checks")
+    add_update_script_substitution("%update_mc_test_checks")
diff --git a/llvm/test/tools/dxil-dis/fastmath.ll b/llvm/test/tools/dxil-dis/fastmath.ll
new file mode 100644
index 0000000..7f4ba5b
--- /dev/null
+++ b/llvm/test/tools/dxil-dis/fastmath.ll
@@ -0,0 +1,23 @@
+; RUN: llc %s --filetype=obj -o - | dxil-dis -o - | FileCheck %s
+target triple = "dxil-unknown-shadermodel6.7-library"
+
+define float @fma(float %0, float %1, float %2) #0 {
+  ; verify reassoc and contract are converted to fast
+  ; CHECK: %4 = fmul fast float %0, %1
+  %4 = fmul reassoc float %0, %1
+  ; CHECK-NEXT: %5 = fadd fast float %4, %2
+  %5 = fadd contract float %4, %2
+  ; verify these are converted to a single fast flag
+  ; CHECK-NEXT: %6 = fmul fast float %0, %1
+  %6 = fmul reassoc contract float %0, %1
+  ; verify these flags are maintained
+  ; CHECK-NEXT: %7 = fadd nnan ninf nsz arcp float %0, %1
+  %7 = fadd nnan ninf nsz arcp float %0, %1
+  ; verify that afn is removed
+  ; CHECK-NEXT: %8 = fmul float %0, %1
+  %8 = fmul afn float %0, %1
+  ret float %5
+}
+
+attributes #0 = { norecurse nounwind readnone willreturn "disable-tail-calls"="false" "waveops-include-helper-lanes" "fp32-denorm-mode"="any" "hlsl.export" }
+
diff --git a/llvm/test/tools/llvm-cov/branch-macros.test b/llvm/test/tools/llvm-cov/branch-macros.test
index e4bd14e..b16ef9d 100644
--- a/llvm/test/tools/llvm-cov/branch-macros.test
+++ b/llvm/test/tools/llvm-cov/branch-macros.test
@@ -1,5 +1,6 @@
 // RUN: llvm-profdata merge %S/Inputs/branch-macros.proftext -o %t.profdata
 // RUN: llvm-cov show --show-expansions --show-branches=count %S/Inputs/branch-macros.o32l -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs | FileCheck %S/Inputs/branch-macros.cpp -check-prefixes=CHECK,BRCOV -D#C=999
+// RUN: llvm-cov show --binary-counters --show-expansions --show-branches=count %S/Inputs/branch-macros.o32l -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs | FileCheck %S/Inputs/branch-macros.cpp -check-prefixes=CHECK,BRCOV -D#C=1
 // RUN: llvm-cov report --show-branch-summary %S/Inputs/branch-macros.o32l -instr-profile %t.profdata -show-functions -path-equivalence=/tmp,%S/Inputs %S/Inputs/branch-macros.cpp | FileCheck %s -check-prefix=REPORT
 
 // RUN: yaml2obj %S/Inputs/branch-macros-single.yaml -o %t.o
diff --git a/llvm/test/tools/llvm-cov/branch-noShowBranch.test b/llvm/test/tools/llvm-cov/branch-noShowBranch.test
index cabeeb0..9f3cfd5 100644
--- a/llvm/test/tools/llvm-cov/branch-noShowBranch.test
+++ b/llvm/test/tools/llvm-cov/branch-noShowBranch.test
@@ -12,7 +12,7 @@
 // REPORT-NOT: conditionals                     24       0 100.00%        15       0 100.00%        16       2  87.50%
 // REPORT-NOT: early_exits                      20       4  80.00%        25       2  92.00%        16       6  62.50%
 // REPORT-NOT: jumps                            39      12  69.23%        48       2  95.83%        26       9  65.38%
-// REPORT-NOT: switches                         28       5  82.14%        38       4  89.47%        30       9  70.00%
+// REPORT-NOT: switches                         28       5  82.14%        38       4  89.47%        28       7  75.00%
 // REPORT-NOT: big_switch                       25       1  96.00%        32       0 100.00%        30       6  80.00%
 // REPORT-NOT: boolean_operators                16       0 100.00%        13       0 100.00%        22       2  90.91%
 // REPORT-NOT: boolop_loops                     19       0 100.00%        14       0 100.00%        16       2  87.50%
@@ -21,5 +21,4 @@
 // REPORT-NOT: main                              1       0 100.00%        16       0 100.00%         0       0   0.00%
 // REPORT-NOT: c-general.c:static_func           4       0 100.00%         4       0 100.00%         2       0 100.00%
 // REPORT:     TOTAL                           197      24  87.82%       234       8  96.58%
-// REPORT-NOT: TOTAL                           197      24  87.82%       234      13  94.44%       174      38  78.16%
-
+// REPORT-NOT: TOTAL                           197      24  87.82%       234       8  96.58%       172      36  79.07%
diff --git a/llvm/test/tools/llvm-cov/showLineExecutionCounts.test b/llvm/test/tools/llvm-cov/showLineExecutionCounts.test
index 4f505f96..a165d8d 100644
--- a/llvm/test/tools/llvm-cov/showLineExecutionCounts.test
+++ b/llvm/test/tools/llvm-cov/showLineExecutionCounts.test
@@ -3,6 +3,7 @@
 // RUN: llvm-profdata merge %S/Inputs/lineExecutionCounts.proftext -o %t.profdata
 
 // RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs | FileCheck -check-prefixes=TEXT,WHOLE-FILE -D#C=999 -DC16K2=16.2k -DC16K1=16.1k %S/Inputs/showLineExecutionCounts.cpp
+// RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -binary-counters -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs | FileCheck -check-prefixes=TEXT,WHOLE-FILE -D#C=1 -DC16K2=1 -DC16K1=1 %S/Inputs/showLineExecutionCounts.cpp
 // RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs -name=main | FileCheck -check-prefixes=TEXT,FILTER -D#C=999 -DC16K2=16.2k -DC16K1=16.1k %S/Inputs/showLineExecutionCounts.cpp
 
 // Test -output-dir.
@@ -16,8 +17,10 @@
 //
 // Test html output.
 // RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -format html -o %t.dir/html -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs
+// RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -format html -o %t.dir/html.binary -binary-counters -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs
 // RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -format html -o %t.dir/html.filtered -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs -name=main
 // RUN: FileCheck -check-prefixes=HTML,HTML-WHOLE-FILE -input-file %t.dir/html/coverage/tmp/showLineExecutionCounts.cpp.html %S/Inputs/showLineExecutionCounts.cpp
+// RUN: FileCheck -check-prefixes=HTML-BINARY,HTML-WHOLE-FILE -input-file %t.dir/html.binary/coverage/tmp/showLineExecutionCounts.cpp.html %S/Inputs/showLineExecutionCounts.cpp
 // RUN: FileCheck -check-prefixes=HTML,HTML-FILTER -input-file %t.dir/html.filtered/coverage/tmp/showLineExecutionCounts.cpp.html %S/Inputs/showLineExecutionCounts.cpp
 //
 // Test index creation.
diff --git a/llvm/test/tools/llvm-dlltool/machine-opt.def b/llvm/test/tools/llvm-dlltool/machine-opt.def
index 6dce825..fcb8529 100644
--- a/llvm/test/tools/llvm-dlltool/machine-opt.def
+++ b/llvm/test/tools/llvm-dlltool/machine-opt.def
@@ -6,6 +6,8 @@
 ; RUN: llvm-readobj %t.a | FileCheck --check-prefix=ARM %s
 ; RUN: llvm-dlltool -m arm64 -d %s -l %t.a
 ; RUN: llvm-readobj %t.a | FileCheck --check-prefix=ARM64 %s
+; RUN: llvm-dlltool -m r4000 -d %s -l %t.a
+; RUN: llvm-readobj %t.a | FileCheck --check-prefix=MIPS %s
 
 LIBRARY test.dll
 EXPORTS
@@ -15,3 +17,4 @@ TestFunction
 ; X86_64: Format: COFF-x86-64
 ; ARM: Format: COFF-ARM{{$}}
 ; ARM64: Format: COFF-ARM64
+; MIPS: Format: COFF-MIPS
diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/cpu-pinning.s b/llvm/test/tools/llvm-exegesis/X86/latency/cpu-pinning.s
index 0ea3752..e7430e4 100644
--- a/llvm/test/tools/llvm-exegesis/X86/latency/cpu-pinning.s
+++ b/llvm/test/tools/llvm-exegesis/X86/latency/cpu-pinning.s
@@ -1,5 +1,5 @@
 # REQUIRES: exegesis-can-measure-latency, x86_64-linux
 
-# RUN: llvm-exegesis -mtriple=x86_64-unknown-unknown -mode=latency -opcode-name=ADD64rr -execution-mode=subprocess | FileCheck %s
+# RUN: llvm-exegesis -mtriple=x86_64-unknown-unknown -mode=latency -opcode-name=ADD64rr -execution-mode=subprocess --benchmark-process-cpu=0 | FileCheck %s
 
 # CHECK: - { key: latency, value: {{[0-9.]*}}, per_snippet_value: {{[0-9.]*}}
diff --git a/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-merged-funcs-dwarf.yaml b/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-merged-funcs-dwarf.yaml
index 94a162c..522c576 100644
--- a/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-merged-funcs-dwarf.yaml
+++ b/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-merged-funcs-dwarf.yaml
@@ -64,6 +64,18 @@
 # CHECK-GSYM-KEEP-NEXT:       0x{{[0-9a-fA-F]+}} /tmp/test_gsym_yaml{{[/\\]}}out/file_0{{[1-3]}}.cpp:10
 # CHECK-GSYM-KEEP-NEXT:       0x{{[0-9a-fA-F]+}} /tmp/test_gsym_yaml{{[/\\]}}out/file_0{{[1-3]}}.cpp:6
 
+## Test the lookup functionality for merged functions:
+# RUN: llvm-gsymutil --verify %t.keep.gSYM --address 0x248 --merged-functions | FileCheck --check-prefix=CHECK-MERGED-LOOKUP %s
+# RUN: llvm-gsymutil --verify %t.keep.gSYM --address 0x248 | FileCheck --check-prefix=CHECK-NORMAL-LOOKUP %s
+
+#### TODO: Fix non-determinism leading that is currently worked around with `{{[1-3]}}` below.
+
+# CHECK-MERGED-LOOKUP: Found 3 functions at address 0x0000000000000248:
+# CHECK-MERGED-LOOKUP-NEXT:       0x0000000000000248: my_func_0{{[1-3]}} @ /tmp/test_gsym_yaml{{[/\\]}}out/file_0{{[1-3]}}.cpp:5
+# CHECK-MERGED-LOOKUP-NEXT-NEXT:  0x0000000000000248: my_func_0{{[1-3]}} @ /tmp/test_gsym_yaml{{[/\\]}}out/file_0{{[1-3]}}.cpp:5
+# CHECK-MERGED-LOOKUP-NEXT-NEXT:  0x0000000000000248: my_func_0{{[1-3]}} @ /tmp/test_gsym_yaml{{[/\\]}}out/file_0{{[1-3]}}.cpp:5
+ 
+# CHECK-NORMAL-LOOKUP: 0x0000000000000248: my_func_0{{[1-3]}} @ /tmp/test_gsym_yaml{{[/\\]}}out/file_0{{[1-3]}}.cpp:5
 
 
 --- !mach-o
diff --git a/llvm/test/tools/llvm-lib/Inputs/mips.ll b/llvm/test/tools/llvm-lib/Inputs/mips.ll
new file mode 100644
index 0000000..dd0f8338
--- /dev/null
+++ b/llvm/test/tools/llvm-lib/Inputs/mips.ll
@@ -0,0 +1,7 @@
+target triple = "mipsel-windows-coff"
+
+; Function Attrs: noinline nounwind optnone
+define dso_local void @"?f@@YAXXZ"() #0 {
+entry:
+  ret void
+}
diff --git a/llvm/test/tools/llvm-lib/infer-machine.test b/llvm/test/tools/llvm-lib/infer-machine.test
new file mode 100644
index 0000000..23ecf25
--- /dev/null
+++ b/llvm/test/tools/llvm-lib/infer-machine.test
@@ -0,0 +1,21 @@
+REQUIRES: mips-registered-target
+
+RUN: rm -rf %t && mkdir -p %t
+
+RUN: llc -mtriple=i386-windows-coff -filetype=obj -o %t/i386.obj %S/Inputs/i386.ll
+RUN: llvm-as %S/Inputs/i386.ll -o %t/i386.bc
+RUN: llvm-lib %t/i386.obj %t/i386.bc /out:%t/i386.lib
+RUN: llvm-objdump -h %t/i386.lib | FileCheck %s --check-prefix=I386
+I386: file format coff-i386
+
+RUN: llc -mtriple=x86_64-windows-coff -filetype=obj -o %t/x86_64.obj %S/Inputs/x86_64.ll
+RUN: llvm-as %S/Inputs/x86_64.ll -o %t/x86_64.bc
+RUN: llvm-lib %t/x86_64.obj %t/x86_64.bc /out:%t/x86_64.lib
+RUN: llvm-objdump -h %t/x86_64.lib | FileCheck %s --check-prefix=X86_64
+X86_64: file format coff-x86-64
+
+RUN: llc -mtriple=mipsel-windows-coff -filetype=obj -o %t/mips.obj %S/Inputs/mips.ll
+RUN: llvm-as %S/Inputs/mips.ll -o %t/mips.bc
+RUN: llvm-lib %t/mips.obj %t/mips.bc /out:%t/mips.lib
+RUN: llvm-objdump -h %t/mips.lib | FileCheck %s --check-prefix=MIPS
+MIPS: file format coff-mips
diff --git a/llvm/test/tools/llvm-lib/machine-opt.test b/llvm/test/tools/llvm-lib/machine-opt.test
new file mode 100644
index 0000000..e5ade82
--- /dev/null
+++ b/llvm/test/tools/llvm-lib/machine-opt.test
@@ -0,0 +1,13 @@
+RUN: rm -f %t.lib
+
+RUN: llvm-lib /out:%t.lib /machine:i386 2>&1 | FileCheck --check-prefix=EMPTYWARN %s
+RUN: llvm-lib /out:%t.lib /machine:amd64 2>&1 | FileCheck --check-prefix=EMPTYWARN %s
+
+RUN: llvm-lib /out:%t.lib /machine:mips 2>&1 | FileCheck --check-prefix=EMPTYWARN %s
+
+RUN: llvm-lib /out:%t.lib /machine:arm 2>&1 | FileCheck --check-prefix=EMPTYWARN %s
+RUN: llvm-lib /out:%t.lib /machine:arm64 2>&1 | FileCheck --check-prefix=EMPTYWARN %s
+RUN: llvm-lib /out:%t.lib /machine:arm64x 2>&1 | FileCheck --check-prefix=EMPTYWARN %s
+
+EMPTYWARN: warning: no input files, not writing output file
+
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-basic-instructions.s
index f4c4a20..cf1cf0e 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-basic-instructions.s
@@ -1891,7 +1891,7 @@ drps
 # CHECK-NEXT:  1      2     0.50                        fmov	s0, s1
 # CHECK-NEXT:  1      2     0.50                        fabs	s2, s3
 # CHECK-NEXT:  1      2     0.50                        fneg	s4, s5
-# CHECK-NEXT:  1      9     1.00                        fsqrt	s6, s7
+# CHECK-NEXT:  9      9     9.00                        fsqrt	s6, s7
 # CHECK-NEXT:  1      3     1.00                        fcvt	d8, s9
 # CHECK-NEXT:  1      3     1.00                        fcvt	h10, s11
 # CHECK-NEXT:  1      3     1.00                        frintn	s12, s13
@@ -1904,7 +1904,7 @@ drps
 # CHECK-NEXT:  1      2     0.50                        fmov	d0, d1
 # CHECK-NEXT:  1      2     0.50                        fabs	d2, d3
 # CHECK-NEXT:  1      2     0.50                        fneg	d4, d5
-# CHECK-NEXT:  1      16    1.00                        fsqrt	d6, d7
+# CHECK-NEXT:  16     16    16.00                       fsqrt	d6, d7
 # CHECK-NEXT:  1      3     1.00                        fcvt	s8, d9
 # CHECK-NEXT:  1      3     1.00                        fcvt	h10, d11
 # CHECK-NEXT:  1      3     1.00                        frintn	d12, d13
@@ -1917,7 +1917,7 @@ drps
 # CHECK-NEXT:  1      3     1.00                        fcvt	s26, h27
 # CHECK-NEXT:  1      3     1.00                        fcvt	d28, h29
 # CHECK-NEXT:  1      3     0.50                        fmul	s20, s19, s17
-# CHECK-NEXT:  1      10    1.00                        fdiv	s1, s2, s3
+# CHECK-NEXT:  10     10    10.00                       fdiv	s1, s2, s3
 # CHECK-NEXT:  1      2     0.50                        fadd	s4, s5, s6
 # CHECK-NEXT:  1      2     0.50                        fsub	s7, s8, s9
 # CHECK-NEXT:  1      2     0.50                        fmax	s10, s11, s12
@@ -1926,7 +1926,7 @@ drps
 # CHECK-NEXT:  1      2     0.50                        fminnm	s19, s20, s21
 # CHECK-NEXT:  1      3     0.50                        fnmul	s22, s23, s2
 # CHECK-NEXT:  1      3     0.50                        fmul	d20, d19, d17
-# CHECK-NEXT:  1      15    1.00                        fdiv	d1, d2, d3
+# CHECK-NEXT:  15     15    15.00                       fdiv	d1, d2, d3
 # CHECK-NEXT:  1      2     0.50                        fadd	d4, d5, d6
 # CHECK-NEXT:  1      2     0.50                        fsub	d7, d8, d9
 # CHECK-NEXT:  1      2     0.50                        fmax	d10, d11, d12
@@ -2557,7 +2557,7 @@ drps
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]
-# CHECK-NEXT: 11.00  11.00  33.00  33.00  87.33  151.33 151.33 517.00 251.00 162.50 162.50 169.50 85.50
+# CHECK-NEXT: 11.00  11.00  33.00  33.00  87.33  151.33 151.33 517.00 251.00 162.50 162.50 215.50 85.50
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    Instructions:
@@ -3075,7 +3075,7 @@ drps
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fmov	s0, s1
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fabs	s2, s3
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fneg	s4, s5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     fsqrt	s6, s7
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     9.00    -     fsqrt	s6, s7
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     fcvt	d8, s9
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     fcvt	h10, s11
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     frintn	s12, s13
@@ -3088,7 +3088,7 @@ drps
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fmov	d0, d1
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fabs	d2, d3
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fneg	d4, d5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     fsqrt	d6, d7
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     16.00   -     fsqrt	d6, d7
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     fcvt	s8, d9
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     fcvt	h10, d11
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     frintn	d12, d13
@@ -3101,7 +3101,7 @@ drps
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     fcvt	s26, h27
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     fcvt	d28, h29
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fmul	s20, s19, s17
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     fdiv	s1, s2, s3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     10.00   -     fdiv	s1, s2, s3
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fadd	s4, s5, s6
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fsub	s7, s8, s9
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fmax	s10, s11, s12
@@ -3110,7 +3110,7 @@ drps
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fminnm	s19, s20, s21
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fnmul	s22, s23, s2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fmul	d20, d19, d17
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     fdiv	d1, d2, d3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     15.00   -     fdiv	d1, d2, d3
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fadd	d4, d5, d6
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fsub	d7, d8, d9
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fmax	d10, d11, d12
diff --git a/llvm/test/tools/llvm-mca/ARM/cortex-a57-memory-instructions.s b/llvm/test/tools/llvm-mca/ARM/cortex-a57-memory-instructions.s
index 04c95f62f..36a2f04 100644
--- a/llvm/test/tools/llvm-mca/ARM/cortex-a57-memory-instructions.s
+++ b/llvm/test/tools/llvm-mca/ARM/cortex-a57-memory-instructions.s
@@ -325,11 +325,11 @@
 # CHECK-NEXT:  2      1     1.00           *            strd	r4, r5, [r12], -r10
 # CHECK-NEXT:  1      1     1.00           *            strh	r3, [r4]
 # CHECK-NEXT:  1      1     1.00           *            strh	r2, [r7, #4]
-# CHECK-NEXT:  2      1     1.00                  U     strh	r1, [r8, #64]!
+# CHECK-NEXT:  2      1     1.00           *            strh	r1, [r8, #64]!
 # CHECK-NEXT:  2      1     1.00           *            strh	r12, [sp], #4
 # CHECK-NEXT:  1      1     1.00           *            strh	r6, [r5, r4]
-# CHECK-NEXT:  2      1     1.00                  U     strh	r3, [r8, r11]!
-# CHECK-NEXT:  2      1     1.00                  U     strh	r1, [r2, -r1]!
+# CHECK-NEXT:  2      1     1.00           *            strh	r3, [r8, r11]!
+# CHECK-NEXT:  2      1     1.00           *            strh	r1, [r2, -r1]!
 # CHECK-NEXT:  2      1     1.00           *            strh	r9, [r7], r2
 # CHECK-NEXT:  2      1     1.00           *            strh	r4, [r3], -r2
 # CHECK-NEXT:  2      1     1.00                  U     strht	r2, [r5], #76
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/div.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/div.s
new file mode 100644
index 0000000..c42b4a9
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/div.s
@@ -0,0 +1,1009 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-p470 -iterations=1 < %s | FileCheck %s
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, mf4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, mf2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, mf8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, mf4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, mf2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, mf8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, mf4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, mf2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, mf8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, mf4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, mf2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m8, tu, mu
+vdiv.vv v8, v16, v24
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, mf4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, mf2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, m2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, m4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, m8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, mf8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, mf4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, mf2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, m2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, m4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, m8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, mf8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, mf4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, mf2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, m2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, m4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, m8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, mf8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, mf4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, mf2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, m2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, m4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, m8, tu, mu
+vdiv.vx v8, v16, a0
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, mf4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, mf2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, mf8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, mf4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, mf2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, mf8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, mf4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, mf2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, mf8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, mf4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, mf2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m8, tu, mu
+vfdiv.vv v8, v16, v24
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, mf4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, mf2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, m2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, m4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, m8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, mf8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, mf4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, mf2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, m2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, m4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, m8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, mf8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, mf4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, mf2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, m2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, m4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, m8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, mf8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, mf4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, mf2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, m2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, m4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, m8, tu, mu
+vfdiv.vf v8, v16, fa0
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, mf4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, mf2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, m2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, m4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, m8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, mf8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, mf4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, mf2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, m2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, m4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, m8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, mf8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, mf4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, mf2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, m2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, m4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, m8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, mf8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, mf4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, mf2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, m2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, m4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, m8, tu, mu
+vfsqrt.v v8, v16
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      320
+# CHECK-NEXT: Total Cycles:      22358
+# CHECK-NEXT: Total uOps:        320
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.01
+# CHECK-NEXT: IPC:               0.01
+# CHECK-NEXT: Block RThroughput: 14361.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      102   102.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      204   204.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      90    90.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      180   180.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      360   360.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      42    42.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      42    42.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      42    42.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      84    84.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      168   168.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      336   336.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      72    72.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      72    72.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      144   144.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      288   288.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      576   576.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      102   102.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      204   204.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      90    90.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      180   180.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      360   360.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      42    42.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      42    42.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      42    42.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      84    84.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      168   168.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      336   336.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      72    72.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      72    72.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      144   144.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      288   288.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      576   576.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      58    58.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      116   116.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      50    50.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      100   100.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      200   200.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      74    74.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      148   148.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      296   296.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      58    58.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      116   116.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      50    50.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      100   100.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      200   200.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      74    74.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      148   148.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      296   296.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      58    58.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      116   116.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      50    50.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      100   100.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      200   200.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      74    74.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      148   148.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      296   296.00                      vfsqrt.v	v8, v16
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SiFiveP400Div
+# CHECK-NEXT: [1]   - SiFiveP400FEXQ0
+# CHECK-NEXT: [2]   - SiFiveP400FloatDiv
+# CHECK-NEXT: [3]   - SiFiveP400IEXQ0
+# CHECK-NEXT: [4]   - SiFiveP400IEXQ1
+# CHECK-NEXT: [5]   - SiFiveP400IEXQ2
+# CHECK-NEXT: [6]   - SiFiveP400Load
+# CHECK-NEXT: [7]   - SiFiveP400Store
+# CHECK-NEXT: [8]   - SiFiveP400VDiv
+# CHECK-NEXT: [9]   - SiFiveP400VEXQ0
+# CHECK-NEXT: [10]  - SiFiveP400VFloatDiv
+# CHECK-NEXT: [11]  - SiFiveP400VLD
+# CHECK-NEXT: [12]  - SiFiveP400VST
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]
+# CHECK-NEXT:  -      -      -      -     160.00  -      -      -     12186.00 725.00 14361.00  -   -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   Instructions:
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     102.00 2.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     204.00 4.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     90.00  2.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     180.00 4.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     360.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     42.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     42.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     42.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     84.00  2.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     168.00 4.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     336.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     72.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     72.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     144.00 2.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     288.00 4.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     576.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     102.00 2.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     204.00 4.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     90.00  2.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     180.00 4.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     360.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     42.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     42.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     42.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     84.00  2.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     168.00 4.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     336.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     72.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     72.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     144.00 2.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     288.00 4.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     576.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   58.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   116.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   50.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   100.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   200.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   37.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   37.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   74.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   148.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   296.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   58.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   116.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   50.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   100.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   200.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   37.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   37.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   74.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   148.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   296.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   58.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   116.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   50.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   100.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   200.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   37.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   37.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   74.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   148.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   296.00  -      -     vfsqrt.v	v8, v16
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/mul-cpop.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/mul-cpop.s
new file mode 100644
index 0000000..5f7a1d1
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/mul-cpop.s
@@ -0,0 +1,60 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-p470 -iterations=1 < %s | FileCheck %s
+
+mul s6, s6, s7
+
+mulw s4, s4, a2
+
+cpop t1, t1
+
+cpopw t2, t2
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      4
+# CHECK-NEXT: Total Cycles:      8
+# CHECK-NEXT: Total uOps:        4
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      2     1.00                        mul	s6, s6, s7
+# CHECK-NEXT:  1      2     1.00                        mulw	s4, s4, a2
+# CHECK-NEXT:  1      2     1.00                        cpop	t1, t1
+# CHECK-NEXT:  1      2     1.00                        cpopw	t2, t2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SiFiveP400Div
+# CHECK-NEXT: [1]   - SiFiveP400FEXQ0
+# CHECK-NEXT: [2]   - SiFiveP400FloatDiv
+# CHECK-NEXT: [3]   - SiFiveP400IEXQ0
+# CHECK-NEXT: [4]   - SiFiveP400IEXQ1
+# CHECK-NEXT: [5]   - SiFiveP400IEXQ2
+# CHECK-NEXT: [6]   - SiFiveP400Load
+# CHECK-NEXT: [7]   - SiFiveP400Store
+# CHECK-NEXT: [8]   - SiFiveP400VDiv
+# CHECK-NEXT: [9]   - SiFiveP400VEXQ0
+# CHECK-NEXT: [10]  - SiFiveP400VFloatDiv
+# CHECK-NEXT: [11]  - SiFiveP400VLD
+# CHECK-NEXT: [12]  - SiFiveP400VST
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]
+# CHECK-NEXT:  -      -      -      -      -     4.00    -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -     mul	s6, s6, s7
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -     mulw	s4, s4, a2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -     cpop	t1, t1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -     cpopw	t2, t2
diff --git a/llvm/test/tools/llvm-objcopy/COFF/Inputs/i386-debug-rdata.yaml b/llvm/test/tools/llvm-objcopy/COFF/Inputs/i386-debug-rdata.yaml
new file mode 100644
index 0000000..02a6e9d
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/COFF/Inputs/i386-debug-rdata.yaml
@@ -0,0 +1,63 @@
+--- !COFF
+OptionalHeader:
+  AddressOfEntryPoint: 4096
+  ImageBase:       268435456
+  SectionAlignment: 4096
+  FileAlignment:   512
+  MajorOperatingSystemVersion: 6
+  MinorOperatingSystemVersion: 0
+  MajorImageVersion: 0
+  MinorImageVersion: 0
+  MajorSubsystemVersion: 6
+  MinorSubsystemVersion: 0
+  Subsystem:       IMAGE_SUBSYSTEM_WINDOWS_CUI
+  DLLCharacteristics: [  ]
+  SizeOfStackReserve: 1048576
+  SizeOfStackCommit: 4096
+  SizeOfHeapReserve: 1048576
+  SizeOfHeapCommit: 4096
+  Debug:
+    RelativeVirtualAddress: 8192
+    Size:            28
+header:
+  Machine:         IMAGE_FILE_MACHINE_I386
+  Characteristics: [ IMAGE_FILE_EXECUTABLE_IMAGE, IMAGE_FILE_32BIT_MACHINE, IMAGE_FILE_DLL ]
+sections:
+  - Name:            .text
+    Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  4096
+    VirtualSize:     18
+    SectionData:     5589E58B45108B450C8B450831C05DC20C00
+    SizeOfRawData:   512
+  - Name:            .rdata
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  8192
+    VirtualSize:     109
+    SectionData:     000000008D6978670000000002000000510000001C2000001C060000525344538B301061671ED0994C4C44205044422E010000002F686F6D652F6D652F446F63756D656E74732F6C6C766D2D6D696E67772F6C6C766D2D70726F6A6563742F6C6C766D2F746573742E70646200
+    SizeOfRawData:   512
+  - Name:            .debug_abbrev
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  12288
+    VirtualSize:     78
+    SectionData:     011101250E1305030E10171B0E110112060000022E011101120640186E0E030E3A0B3B0B2719360B49133F1900000305000218030E3A0B3B0B49130000042400030E3E0B0B0B0000050F00000000
+    SizeOfRawData:   512
+  - Name:            .debug_info
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  16384
+    VirtualSize:     116
+    SectionData:     700000000400000000000401000000001D006E000000000000007500000000100010120000000200100010120000000155A5000000BC0000000101B16B00000003029108D70000000101720000000302910CD500000001016B00000003029110D30000000101720000000004CF00000005040500
+    SizeOfRawData:   512
+  - Name:            .debug_line
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  20480
+    VirtualSize:     60
+    SectionData:     3800000004001E000000010101FB0E0D00010101010000000100000100746573742E6300000000000005020010001001053D0ABA060B2E0204000101
+    SizeOfRawData:   512
+  - Name:            .debug_str
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  24576
+    VirtualSize:     217
+    SectionData:     636C616E672076657273696F6E2032302E302E30676974202868747470733A2F2F6769746875622E636F6D2F62796C6177732F6C6C766D2D70726F6A6563742E67697420393963353263306236613662396366303765383365656265393364323831333635656165383732332900746573742E63002F686F6D652F6D652F446F63756D656E74732F6C6C766D2D6D696E67772F6C6C766D2D70726F6A6563742F6C6C766D005F5F446C6C4D61696E43525453746172747570403132005F446C6C4D61696E4352545374617274757000696E7400630062006100
+    SizeOfRawData:   512
+symbols:         []
+...
diff --git a/llvm/test/tools/llvm-objcopy/COFF/only-keep-debug-rdata.test b/llvm/test/tools/llvm-objcopy/COFF/only-keep-debug-rdata.test
new file mode 100644
index 0000000..affd4b6
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/COFF/only-keep-debug-rdata.test
@@ -0,0 +1,45 @@
+RUN: yaml2obj %p/Inputs/i386-debug-rdata.yaml -o %t.in.exe
+
+RUN: llvm-objcopy --only-keep-debug %t.in.exe %t.out.exe
+RUN: llvm-readobj --sections %t.out.exe | FileCheck %s
+
+Check that all non-debug/rodata (which contains the debug directory in this case)
+sections with IMAGE_SCN_CNT_CODE or IMAGE_SCN_CNT_INITIALIZED_DATA are truncated,
+and no others.
+
+CHECK:       Section {
+CHECK-NEXT:   Number: 1
+CHECK-NEXT:   Name: .text (2E 74 65 78 74 00 00 00)
+CHECK-NEXT:   VirtualSize: 0x12
+CHECK-NEXT:   VirtualAddress: 0x1000
+CHECK-NEXT:   RawDataSize: 0
+CHECK:       Section {
+CHECK-NEXT:   Number: 2
+CHECK-NEXT:   Name: .rdata (2E 72 64 61 74 61 00 00)
+CHECK-NEXT:   VirtualSize: 0x6D
+CHECK-NEXT:   VirtualAddress: 0x2000
+CHECK-NEXT:   RawDataSize: 512
+CHECK:       Section {
+CHECK-NEXT:   Number: 3
+CHECK-NEXT:   Name: .debug_abbrev (2F 34 00 00 00 00 00 00)
+CHECK-NEXT:   VirtualSize: 0x4E
+CHECK-NEXT:   VirtualAddress: 0x3000
+CHECK-NEXT:   RawDataSize: 512
+CHECK:       Section {
+CHECK-NEXT:   Number: 4
+CHECK-NEXT:   Name: .debug_info (2F 32 39 00 00 00 00 00)
+CHECK-NEXT:   VirtualSize: 0x74
+CHECK-NEXT:   VirtualAddress: 0x4000
+CHECK-NEXT:   RawDataSize: 512
+CHECK:       Section {
+CHECK-NEXT:   Number: 5
+CHECK-NEXT:   Name: .debug_line (2F 34 31 00 00 00 00 00)
+CHECK-NEXT:   VirtualSize: 0x3C
+CHECK-NEXT:   VirtualAddress: 0x5000
+CHECK-NEXT:   RawDataSize: 512
+CHECK:       Section {
+CHECK-NEXT:   Number: 6
+CHECK-NEXT:   Name: .debug_str (2F 31 38 00 00 00 00 00)
+CHECK-NEXT:   VirtualSize: 0xD9
+CHECK-NEXT:   VirtualAddress: 0x6000
+CHECK-NEXT:   RawDataSize: 512
diff --git a/llvm/test/tools/llvm-objcopy/MachO/globalize-symbol.test b/llvm/test/tools/llvm-objcopy/MachO/globalize-symbol.test
new file mode 100644
index 0000000..ea47d44
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/MachO/globalize-symbol.test
@@ -0,0 +1,134 @@
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-objcopy --wildcard --globalize-symbol="*" %t %t.copy
+# RUN: llvm-readobj --symbols %t.copy | FileCheck %s
+
+# RUN: echo "*" > %t-star.txt
+# RUN: llvm-objcopy --wildcard --globalize-symbols="%t-star.txt" %t %t.copy
+# RUN: llvm-readobj --symbols %t.copy | FileCheck %s
+
+# CHECK:      Symbols [
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _PrivateSymbol
+# CHECK-NEXT:     Extern
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x1
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _PrivateExternalSymbol
+# CHECK-NEXT:     PrivateExtern
+# CHECK-NEXT:     Extern
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x2
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _CommonSymbol
+# CHECK-NEXT:     Extern
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x3
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _UndefinedExternalSymbol
+# CHECK-NEXT:     Extern
+# CHECK-NEXT:     Type: Undef (0x0)
+# CHECK-NEXT:     Section:  (0x0)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x0
+# CHECK-NEXT:   }
+# CHECK-NEXT: ]
+
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x100000C
+  cpusubtype:      0x0
+  filetype:        0x2
+  ncmds:           3
+  sizeofcmds:      328
+  flags:           0x200085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __TEXT
+    vmaddr:          4294967296
+    vmsize:          4096
+    fileoff:         0
+    filesize:        4096
+    maxprot:         5
+    initprot:        5
+    nsects:          1
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x100000FF8
+        size:            8
+        offset:          0xFF8
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         00008052C0035FD6
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4294971392
+    vmsize:          4096
+    fileoff:         4096
+    filesize:        67
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          4096
+    nsyms:           4
+    stroff:          4164
+    strsize:         79
+LinkEditData:
+  NameList:
+    - n_strx:          2
+      n_type:          0x0E
+      n_sect:          1
+      n_desc:          0
+      n_value:         1
+    - n_strx:          17
+      n_type:          0x1E
+      n_sect:          1
+      n_desc:          0
+      n_value:         2
+    - n_strx:          40
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          0
+      n_value:         3
+    - n_strx:          54
+      n_type:          0x01
+      n_sect:          0
+      n_desc:          0
+      n_value:         0
+  StringTable:
+    - ' '
+    - _PrivateSymbol
+    - _PrivateExternalSymbol
+    - _CommonSymbol
+    - _UndefinedExternalSymbol
+...
diff --git a/llvm/test/tools/llvm-objcopy/MachO/keep-global-symbol.test b/llvm/test/tools/llvm-objcopy/MachO/keep-global-symbol.test
new file mode 100644
index 0000000..009a732
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/MachO/keep-global-symbol.test
@@ -0,0 +1,147 @@
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-objcopy --keep-global-symbol _CommonSymbol %t %t.copy
+# RUN: llvm-readobj --symbols %t.copy | FileCheck %s
+
+# RUN: echo _CommonSymbol > %t-sym-list.txt
+# RUN: llvm-objcopy --wildcard --keep-global-symbols="%t-sym-list.txt" %t %t.copy
+# RUN: llvm-readobj --symbols %t.copy | FileCheck %s
+
+# CHECK:      Symbols [
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _PrivateSymbol
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x1
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _PrivateExternalSymbol
+# CHECK-NEXT:     PrivateExtern
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x2
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _CommonSymbol2
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x4
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _CommonSymbol
+# CHECK-NEXT:     Extern
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x3
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _UndefinedExternalSymbol
+# CHECK-NEXT:     Extern
+# CHECK-NEXT:     Type: Undef (0x0)
+# CHECK-NEXT:     Section:  (0x0)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x0
+# CHECK-NEXT:   }
+# CHECK-NEXT: ]
+
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x100000C
+  cpusubtype:      0x0
+  filetype:        0x2
+  ncmds:           3
+  sizeofcmds:      328
+  flags:           0x200085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __TEXT
+    vmaddr:          4294967296
+    vmsize:          4096
+    fileoff:         0
+    filesize:        4096
+    maxprot:         5
+    initprot:        5
+    nsects:          1
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x100000FF8
+        size:            8
+        offset:          0xFF8
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         00008052C0035FD6
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4294971392
+    vmsize:          4096
+    fileoff:         4096
+    filesize:        94
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          4096
+    nsyms:           5
+    stroff:          4176
+    strsize:         94
+LinkEditData:
+  NameList:
+    - n_strx:          2
+      n_type:          0x0E
+      n_sect:          1
+      n_desc:          0
+      n_value:         1
+    - n_strx:          17
+      n_type:          0x1E
+      n_sect:          1
+      n_desc:          0
+      n_value:         2
+    - n_strx:          40
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          0
+      n_value:         3
+    - n_strx:          54
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          0
+      n_value:         4
+    - n_strx:          69
+      n_type:          0x01
+      n_sect:          0
+      n_desc:          0
+      n_value:         0
+  StringTable:
+    - ' '
+    - _PrivateSymbol
+    - _PrivateExternalSymbol
+    - _CommonSymbol
+    - _CommonSymbol2
+    - _UndefinedExternalSymbol
+...
diff --git a/llvm/test/tools/llvm-objcopy/MachO/localize-symbol.test b/llvm/test/tools/llvm-objcopy/MachO/localize-symbol.test
new file mode 100644
index 0000000..131d3bf
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/MachO/localize-symbol.test
@@ -0,0 +1,131 @@
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-objcopy --wildcard --localize-symbol="*" %t %t.copy
+# RUN: llvm-readobj --symbols %t.copy | FileCheck %s
+
+# RUN: echo "*" > %t-star.txt
+# RUN: llvm-objcopy --wildcard --localize-symbols="%t-star.txt" %t %t.copy
+# RUN: llvm-readobj --symbols %t.copy | FileCheck %s
+
+# CHECK:      Symbols [
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _PrivateSymbol
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x1
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _PrivateExternalSymbol
+# CHECK-NEXT:     PrivateExtern
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x2
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _CommonSymbol
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x3
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _UndefinedExternalSymbol
+# CHECK-NEXT:     Extern
+# CHECK-NEXT:     Type: Undef (0x0)
+# CHECK-NEXT:     Section:  (0x0)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x0
+# CHECK-NEXT:   }
+# CHECK-NEXT: ]
+
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x100000C
+  cpusubtype:      0x0
+  filetype:        0x2
+  ncmds:           3
+  sizeofcmds:      328
+  flags:           0x200085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __TEXT
+    vmaddr:          4294967296
+    vmsize:          4096
+    fileoff:         0
+    filesize:        4096
+    maxprot:         5
+    initprot:        5
+    nsects:          1
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x100000FF8
+        size:            8
+        offset:          0xFF8
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         00008052C0035FD6
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4294971392
+    vmsize:          4096
+    fileoff:         4096
+    filesize:        67
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          4096
+    nsyms:           4
+    stroff:          4164
+    strsize:         79
+LinkEditData:
+  NameList:
+    - n_strx:          2
+      n_type:          0x0E
+      n_sect:          1
+      n_desc:          0
+      n_value:         1
+    - n_strx:          17
+      n_type:          0x1E
+      n_sect:          1
+      n_desc:          0
+      n_value:         2
+    - n_strx:          40
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          0
+      n_value:         3
+    - n_strx:          54
+      n_type:          0x01
+      n_sect:          0
+      n_desc:          0
+      n_value:         0
+  StringTable:
+    - ' '
+    - _PrivateSymbol
+    - _PrivateExternalSymbol
+    - _CommonSymbol
+    - _UndefinedExternalSymbol
+...
diff --git a/llvm/test/tools/llvm-objcopy/MachO/skip-symbol.test b/llvm/test/tools/llvm-objcopy/MachO/skip-symbol.test
new file mode 100644
index 0000000..0991fb3
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/MachO/skip-symbol.test
@@ -0,0 +1,148 @@
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-objcopy --wildcard --localize-symbol="*" --skip-symbol _CommonSymbol %t %t.copy
+# RUN: llvm-readobj --symbols %t.copy | FileCheck %s
+
+# RUN: echo "*" > %t-star.txt
+# RUN: echo _CommonSymbol > %t-sym-list.txt
+# RUN: llvm-objcopy --wildcard --localize-symbols="%t-star.txt" --skip-symbols="%t-sym-list.txt" %t %t.copy
+# RUN: llvm-readobj --symbols %t.copy | FileCheck %s
+
+# CHECK:      Symbols [
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _PrivateSymbol
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x1
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _PrivateExternalSymbol
+# CHECK-NEXT:     PrivateExtern
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x2
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _CommonSymbol2
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x4
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _CommonSymbol
+# CHECK-NEXT:     Extern
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x3
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _UndefinedExternalSymbol
+# CHECK-NEXT:     Extern
+# CHECK-NEXT:     Type: Undef (0x0)
+# CHECK-NEXT:     Section:  (0x0)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x0
+# CHECK-NEXT:   }
+# CHECK-NEXT: ]
+
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x100000C
+  cpusubtype:      0x0
+  filetype:        0x2
+  ncmds:           3
+  sizeofcmds:      328
+  flags:           0x200085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __TEXT
+    vmaddr:          4294967296
+    vmsize:          4096
+    fileoff:         0
+    filesize:        4096
+    maxprot:         5
+    initprot:        5
+    nsects:          1
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x100000FF8
+        size:            8
+        offset:          0xFF8
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         00008052C0035FD6
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4294971392
+    vmsize:          4096
+    fileoff:         4096
+    filesize:        94
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          4096
+    nsyms:           5
+    stroff:          4176
+    strsize:         94
+LinkEditData:
+  NameList:
+    - n_strx:          2
+      n_type:          0x0E
+      n_sect:          1
+      n_desc:          0
+      n_value:         1
+    - n_strx:          17
+      n_type:          0x1E
+      n_sect:          1
+      n_desc:          0
+      n_value:         2
+    - n_strx:          40
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          0
+      n_value:         3
+    - n_strx:          54
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          0
+      n_value:         4
+    - n_strx:          69
+      n_type:          0x01
+      n_sect:          0
+      n_desc:          0
+      n_value:         0
+  StringTable:
+    - ' '
+    - _PrivateSymbol
+    - _PrivateExternalSymbol
+    - _CommonSymbol
+    - _CommonSymbol2
+    - _UndefinedExternalSymbol
+...
diff --git a/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test b/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test
new file mode 100644
index 0000000..19b06b1
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test
@@ -0,0 +1,217 @@
+# RUN: rm -rf %t && mkdir %t
+# RUN: yaml2obj %s -o %t/original
+# RUN: llvm-strip --strip-all %t/original -o %t/stripped
+# RUN: llvm-readobj --macho-segment %t/stripped | FileCheck %s
+
+# CHECK-LABEL: Name: __PAGEZERO
+# CHECK:       fileoff: 16384
+
+# CHECK-LABEL: Name: __TEXT
+# CHECK:       fileoff: 16384
+
+# The YAML below is the following code
+# int main(int argc, char **argv) { return 0; }
+# Compiled on macOS against the macOS SDK and passing `-Wl,-encryptable`
+# Contents are removed, since they are not important for the test. We need a
+# small text segment (smaller than a page).
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x100000C
+  cpusubtype:      0x0
+  filetype:        0x2
+  ncmds:           15
+  sizeofcmds:      696
+  flags:           0x200085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __PAGEZERO
+    vmaddr:          0
+    vmsize:          4294967296
+    fileoff:         0
+    filesize:        0
+    maxprot:         0
+    initprot:        0
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         232
+    segname:         __TEXT
+    vmaddr:          4294967296
+    vmsize:          32768
+    fileoff:         0
+    filesize:        32768
+    maxprot:         5
+    initprot:        5
+    nsects:          2
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x100004000
+        size:            32
+        offset:          0x4000
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+      - sectname:        __unwind_info
+        segname:         __TEXT
+        addr:            0x100004020
+        size:            4152
+        offset:          0x4020
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4295000064
+    vmsize:          592
+    fileoff:         32768
+    filesize:        592
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_DYLD_CHAINED_FIXUPS
+    cmdsize:         16
+    dataoff:         32768
+    datasize:        48
+  - cmd:             LC_DYLD_EXPORTS_TRIE
+    cmdsize:         16
+    dataoff:         32816
+    datasize:        48
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          32872
+    nsyms:           2
+    stroff:          32904
+    strsize:         32
+  - cmd:             LC_DYSYMTAB
+    cmdsize:         80
+    ilocalsym:       0
+    nlocalsym:       0
+    iextdefsym:      0
+    nextdefsym:      2
+    iundefsym:       2
+    nundefsym:       0
+    tocoff:          0
+    ntoc:            0
+    modtaboff:       0
+    nmodtab:         0
+    extrefsymoff:    0
+    nextrefsyms:     0
+    indirectsymoff:  0
+    nindirectsyms:   0
+    extreloff:       0
+    nextrel:         0
+    locreloff:       0
+    nlocrel:         0
+  - cmd:             LC_ENCRYPTION_INFO_64
+    cmdsize:         24
+    cryptoff:        16384
+    cryptsize:       16384
+    cryptid:         0
+    pad:             0
+  - cmd:             LC_LOAD_DYLINKER
+    cmdsize:         32
+    name:            12
+    Content:         '/usr/lib/dyld'
+    ZeroPadBytes:    7
+  - cmd:             LC_UUID
+    cmdsize:         24
+    uuid:            4C4C4447-5555-3144-A18A-01E9EB7E7D92
+  - cmd:             LC_BUILD_VERSION
+    cmdsize:         32
+    platform:        1
+    minos:           983040
+    sdk:             983552
+    ntools:          1
+    Tools:
+      - tool:            4
+        version:         1310720
+  - cmd:             LC_MAIN
+    cmdsize:         24
+    entryoff:        16384
+    stacksize:       0
+  - cmd:             LC_FUNCTION_STARTS
+    cmdsize:         16
+    dataoff:         32864
+    datasize:        8
+  - cmd:             LC_DATA_IN_CODE
+    cmdsize:         16
+    dataoff:         32872
+    datasize:        0
+  - cmd:             LC_CODE_SIGNATURE
+    cmdsize:         16
+    dataoff:         32944
+    datasize:        416
+LinkEditData:
+  ExportTrie:
+    TerminalSize:    0
+    NodeOffset:      0
+    Name:            ''
+    Flags:           0x0
+    Address:         0x0
+    Other:           0x0
+    ImportName:      ''
+    Children:
+      - TerminalSize:    0
+        NodeOffset:      5
+        Name:            _
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    4
+            NodeOffset:      33
+            Name:            main
+            Flags:           0x0
+            Address:         0x4000
+            Other:           0x0
+            ImportName:      ''
+          - TerminalSize:    2
+            NodeOffset:      39
+            Name:            _mh_execute_header
+            Flags:           0x0
+            Address:         0x0
+            Other:           0x0
+            ImportName:      ''
+  NameList:
+    - n_strx:          2
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          0
+      n_value:         4294983680
+    - n_strx:          8
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          16
+      n_value:         4294967296
+  StringTable:
+    - ' '
+    - _main
+    - __mh_execute_header
+    - ''
+    - ''
+    - ''
+    - ''
+  FunctionStarts:  [ 0x4000 ]
+  ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x30, 0x0, 
+                     0x0, 0x0, 0x30, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                     0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                     0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ]
+...
+
diff --git a/llvm/test/tools/llvm-profgen/context-depth.test b/llvm/test/tools/llvm-profgen/context-depth.test
new file mode 100644
index 0000000..4eaa5fa
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/context-depth.test
@@ -0,0 +1,125 @@
+; Test --csprof-max-context-depth and --csprof-max-unsymbolized-context-depth
+
+; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-hot-count=0 --csprof-max-context-depth=0 --csspgo-preinliner=0 --gen-cs-nested-profile=0
+; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-CTX-DEPTH
+; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-hot-count=0 --csprof-max-unsymbolized-context-depth=2 --csspgo-preinliner=0 --gen-cs-nested-profile=0 --skip-symbolization
+; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-UNSYM-CTX-DEPTH
+; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-hot-count=0 --csprof-max-unsymbolized-context-depth=2 --csspgo-preinliner=0 --gen-cs-nested-profile=0
+; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-UNSYM-CTX-DEPTH-PROF
+; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-hot-count=0 --csprof-max-unsymbolized-context-depth=2 --csprof-max-context-depth=0  --csspgo-preinliner=0 --gen-cs-nested-profile=0
+; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-CTX-DEPTH
+
+
+; CHECK-MAX-CTX-DEPTH: [fb]:19:6
+; CHECK-MAX-CTX-DEPTH:  1: 6
+; CHECK-MAX-CTX-DEPTH:  2: 3
+; CHECK-MAX-CTX-DEPTH:  3: 3
+; CHECK-MAX-CTX-DEPTH:  4: 0
+; CHECK-MAX-CTX-DEPTH:  5: 4 fb:4
+; CHECK-MAX-CTX-DEPTH:  6: 3 fa:3
+; CHECK-MAX-CTX-DEPTH:  !CFGChecksum: 563022570642068
+; CHECK-MAX-CTX-DEPTH: [fa]:14:4
+; CHECK-MAX-CTX-DEPTH:  1: 4
+; CHECK-MAX-CTX-DEPTH:  3: 4
+; CHECK-MAX-CTX-DEPTH:  4: 2
+; CHECK-MAX-CTX-DEPTH:  5: 1
+; CHECK-MAX-CTX-DEPTH:  6: 0
+; CHECK-MAX-CTX-DEPTH:  7: 2 fb:2
+; CHECK-MAX-CTX-DEPTH:  8: 1 fa:1
+; CHECK-MAX-CTX-DEPTH:  !CFGChecksum: 563070469352221
+
+
+; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7ab @ 0x7ab]
+; CHECK-MAX-UNSYM-CTX-DEPTH:   3
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a0-7a7:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a0-7ab:3
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7b2-7b5:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   3
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a7->7b2:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7ab->7a0:4
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7b5->7c0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7ab @ 0x7b5]
+; CHECK-MAX-UNSYM-CTX-DEPTH:   1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7c0-7d4:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7d4->7c0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7b5 @ 0x7d4]
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7c0-7cd:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7db-7e0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7cd->7db:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7e0->7a0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7b5 @ 0x7e0]
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a0-7a7:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7b2-7b5:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a7->7b2:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7b5->7c0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7d4 @ 0x7e0]
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a0-7a7:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7b2-7b5:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a7->7b2:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7b5->7c0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7e0 @ 0x7b5]
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7c0-7cd:2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7db-7e0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7cd->7db:2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7e0->7a0:1
+
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fb:5 @ fb:5 @ fb]:13:4
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  1: 4
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  2: 3
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  3: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  4: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  5: 4 fb:4
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  6: 1 fa:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  !CFGChecksum: 563022570642068
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fa:7 @ fb:6 @ fa]:6:2
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  1: 2
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  3: 2
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  4: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  5: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  6: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  7: 1 fb:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  8: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  !CFGChecksum: 563070469352221
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fb:5 @ fb:6 @ fa]:4:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  1: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  3: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  4: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  5: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  6: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  7: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  8: 1 fa:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  !CFGChecksum: 563070469352221
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fb:6 @ fa:8 @ fa]:4:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  1: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  3: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  4: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  5: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  6: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  7: 1 fb:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  8: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  !CFGChecksum: 563070469352221
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fa:8 @ fa:7 @ fb]:3:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  1: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  2: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  3: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  4: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  5: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  6: 1 fa:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  !CFGChecksum: 563022570642068
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fb:6 @ fa:7 @ fb]:3:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  1: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  2: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  3: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  4: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  5: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  6: 1 fa:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  !CFGChecksum: 563022570642068
diff --git a/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test b/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test
index c673028..b8e3e24 100644
--- a/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test
+++ b/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test
@@ -9,9 +9,6 @@
 ; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-UNWINDER
 ; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe-nommap.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --profile-summary-hot-count=0 --csspgo-preinliner=0 --gen-cs-nested-profile=0
 ; RUN: FileCheck %s --input-file %t
-; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-hot-count=0 --csprof-max-context-depth=0 --csspgo-preinliner=0 --gen-cs-nested-profile=0
-; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-CTX-DEPTH
-
 
 ; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa]:4:1
 ; CHECK-UNCOMPRESS:  1: 1
@@ -68,23 +65,6 @@
 ; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb]:1:0
 ; CHECK-UNCOMPRESS:  5: 1 fb:1
 ; CHECK-UNCOMPRESS:  !CFGChecksum: 563022570642068
-; CHECK-MAX-CTX-DEPTH: [fb]:19:6
-; CHECK-MAX-CTX-DEPTH:  1: 6
-; CHECK-MAX-CTX-DEPTH:  2: 3
-; CHECK-MAX-CTX-DEPTH:  3: 3
-; CHECK-MAX-CTX-DEPTH:  4: 0
-; CHECK-MAX-CTX-DEPTH:  5: 4 fb:4
-; CHECK-MAX-CTX-DEPTH:  6: 3 fa:3
-; CHECK-MAX-CTX-DEPTH:  !CFGChecksum: 563022570642068
-; CHECK-MAX-CTX-DEPTH: [fa]:14:4
-; CHECK-MAX-CTX-DEPTH:  1: 4
-; CHECK-MAX-CTX-DEPTH:  3: 4
-; CHECK-MAX-CTX-DEPTH:  4: 2
-; CHECK-MAX-CTX-DEPTH:  5: 1
-; CHECK-MAX-CTX-DEPTH:  6: 0
-; CHECK-MAX-CTX-DEPTH:  7: 2 fb:2
-; CHECK-MAX-CTX-DEPTH:  8: 1 fa:1
-; CHECK-MAX-CTX-DEPTH:  !CFGChecksum: 563070469352221
 
 
 ; CHECK: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb]:13:4
diff --git a/llvm/test/tools/llvm-xray/X86/account-empty-stack-error.yaml b/llvm/test/tools/llvm-xray/X86/account-exit-mismatch-empty-stack-error.yaml
index d02d070a..d02d070a 100644
--- a/llvm/test/tools/llvm-xray/X86/account-empty-stack-error.yaml
+++ b/llvm/test/tools/llvm-xray/X86/account-exit-mismatch-empty-stack-error.yaml
diff --git a/llvm/test/tools/llvm-xray/X86/account-exit-mismatch-non-empty-stack-error.yaml b/llvm/test/tools/llvm-xray/X86/account-exit-mismatch-non-empty-stack-error.yaml
new file mode 100644
index 0000000..7233110
--- /dev/null
+++ b/llvm/test/tools/llvm-xray/X86/account-exit-mismatch-non-empty-stack-error.yaml
@@ -0,0 +1,31 @@
+#RUN: not llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -d 2>&1 | FileCheck %s
+#RUN: llvm-xray account %s -k -o - -m %S/Inputs/simple-instrmap.yaml -d 2>&1 | FileCheck %s --check-prefix=KEEPGOING
+
+---
+header:
+  version: 1
+  type: 0
+  constant-tsc: true
+  nonstop-tsc: true
+  cycle-frequency: 0
+records:
+# An exit record doesn't match an entry record on a non empty stack with sibling call deduction. 
+# This can happen for example when an instrumented function does a 'fork()', 
+# where the child process will not see
+# the entry record but see the exit record. This is completely valid data,
+# which should be handled with grace (i.e. we treat it as an error, but since
+# the llvm-xray account tool has an option to keep going, gives the user a
+# chance to retry).
+  - { type: 0, func-id: 1, cpu: 1, thread: 1, kind: function-enter, tsc: 10000 }
+  - { type: 0, func-id: 4, cpu: 1, thread: 1, kind: function-exit,  tsc: 10001 }
+  - { type: 0, func-id: 1, cpu: 1, thread: 1, kind: function-exit,  tsc: 10002 }
+...
+
+#CHECK:      Error processing record: {{.*}}
+#CHECK-NEXT: Thread ID: 1
+#CHECK-NEXT:   #1 @(1)
+#CHECK-NEXT: llvm-xray: Failed accounting function calls in file '{{.*}}'.
+
+#KEEPGOING:      Error processing record: {{.*}}
+#KEEPGOING-NEXT: Thread ID: 1
+#KEEPGOING-NEXT:   #1 @(1)
diff --git a/llvm/test/tools/llvm-xray/X86/account-keep-going.yaml b/llvm/test/tools/llvm-xray/X86/account-keep-going.yaml
index 76011ee..fb1a8f4 100644
--- a/llvm/test/tools/llvm-xray/X86/account-keep-going.yaml
+++ b/llvm/test/tools/llvm-xray/X86/account-keep-going.yaml
@@ -7,8 +7,8 @@ header:
   nonstop-tsc: true
   cycle-frequency: 0
 records:
-# We want to test the case for when we see spurious exits, but keep going
-# anyway ignoring the records in the process.
+# We want to test the case for when we see spurious exits without sibling call deduction, 
+# but keep going anyway ignoring the records in the process.
   - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, tsc: 10000 }
   - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-enter, tsc: 10001 }
   - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-enter, tsc: 10002 }